client: Go Coverage Report

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "bytes"
        "context"
        "encoding/json"
        "fmt"
        "strconv"
        "strings"
        "sync"
        "time"

        containersapi "github.com/containerd/containerd/api/services/containers/v1"
        diffapi "github.com/containerd/containerd/api/services/diff/v1"
        imagesapi "github.com/containerd/containerd/api/services/images/v1"
        leasesapi "github.com/containerd/containerd/api/services/leases/v1"
        namespacesapi "github.com/containerd/containerd/api/services/namespaces/v1"
        sandboxsapi "github.com/containerd/containerd/api/services/sandbox/v1"
        snapshotsapi "github.com/containerd/containerd/api/services/snapshots/v1"
        "github.com/containerd/containerd/api/services/tasks/v1"
        versionservice "github.com/containerd/containerd/api/services/version/v1"
        apitypes "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/content"
        contentproxy "github.com/containerd/containerd/v2/core/content/proxy"
        "github.com/containerd/containerd/v2/core/events"
        eventsproxy "github.com/containerd/containerd/v2/core/events/proxy"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/introspection"
        introspectionproxy "github.com/containerd/containerd/v2/core/introspection/proxy"
        "github.com/containerd/containerd/v2/core/leases"
        leasesproxy "github.com/containerd/containerd/v2/core/leases/proxy"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/containerd/v2/core/remotes/docker"
        "github.com/containerd/containerd/v2/core/sandbox"
        sandboxproxy "github.com/containerd/containerd/v2/core/sandbox/proxy"
        "github.com/containerd/containerd/v2/core/snapshots"
        snproxy "github.com/containerd/containerd/v2/core/snapshots/proxy"
        "github.com/containerd/containerd/v2/defaults"
        "github.com/containerd/containerd/v2/pkg/dialer"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
        "github.com/containerd/typeurl/v2"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "github.com/opencontainers/runtime-spec/specs-go"
        "github.com/opencontainers/runtime-spec/specs-go/features"
        "golang.org/x/sync/semaphore"
        "google.golang.org/grpc"
        "google.golang.org/grpc/backoff"
        "google.golang.org/grpc/credentials/insecure"
        "google.golang.org/grpc/health/grpc_health_v1"
)

// init registers type URLs for the OCI runtime-spec types that this package
// commonly marshals. Every URL is rooted at "types.containerd.io" and
// includes the runtime-spec major version.
func init() {
        const (
                prefix = "types.containerd.io"
                spec   = "opencontainers/runtime-spec"
        )
        major := strconv.Itoa(specs.VersionMajor)

        typeurl.Register(&specs.Spec{}, prefix, spec, major, "Spec")
        typeurl.Register(&specs.Process{}, prefix, spec, major, "Process")
        typeurl.Register(&specs.LinuxResources{}, prefix, spec, major, "LinuxResources")
        typeurl.Register(&specs.WindowsResources{}, prefix, spec, major, "WindowsResources")
        typeurl.Register(&features.Features{}, prefix, spec, major, "features", "Features")
}

// New returns a new containerd client that is connected to the containerd
// instance provided by address.
//
// When address is non-empty a blocking gRPC dial is attempted, bounded by the
// configured timeout (10 seconds when none is set). When address is empty the
// client relies solely on the services supplied through opts; if neither a
// connection nor services are available, an error wrapping
// errdefs.ErrUnavailable is returned.
//
// Unless a default runtime was configured explicitly, and a default namespace
// is set, the namespace's default-runtime label (if non-empty) overrides
// defaults.DefaultRuntime.
func New(address string, opts ...Opt) (*Client, error) {
        var copts clientOpts
        for _, o := range opts {
                if err := o(&copts); err != nil {
                        return nil, err
                }
        }
        // Fall back to a 10 second dial timeout when none was configured.
        if copts.timeout == 0 {
                copts.timeout = 10 * time.Second
        }

        c := &Client{
                defaultns: copts.defaultns,
        }

        if copts.defaultRuntime != "" {
                c.runtime = copts.defaultRuntime
        } else {
                c.runtime = defaults.DefaultRuntime
        }

        if copts.defaultPlatform != nil {
                c.platform = copts.defaultPlatform
        } else {
                c.platform = platforms.Default()
        }

        if copts.services != nil {
                c.services = *copts.services
        }
        if address != "" {
                // Cap the reconnect backoff at 3 seconds.
                backoffConfig := backoff.DefaultConfig
                backoffConfig.MaxDelay = 3 * time.Second
                connParams := grpc.ConnectParams{
                        Backoff: backoffConfig,
                }
                gopts := []grpc.DialOption{
                        grpc.WithBlock(),
                        grpc.WithTransportCredentials(insecure.NewCredentials()),
                        grpc.FailOnNonTempDialError(true),
                        grpc.WithConnectParams(connParams),
                        grpc.WithContextDialer(dialer.ContextDialer),
                        grpc.WithReturnConnectionError(),
                }
                // Caller-supplied dial options replace the defaults above
                // entirely; they are not merged with them.
                if len(copts.dialOptions) > 0 {
                        gopts = copts.dialOptions
                }
                // The message size limits are always appended, followed by any
                // caller-supplied call options.
                gopts = append(gopts, grpc.WithDefaultCallOptions(
                        grpc.MaxCallRecvMsgSize(defaults.DefaultMaxRecvMsgSize),
                        grpc.MaxCallSendMsgSize(defaults.DefaultMaxSendMsgSize)))
                if len(copts.callOptions) > 0 {
                        gopts = append(gopts, grpc.WithDefaultCallOptions(copts.callOptions...))
                }
                // Chain interceptors built for the default namespace onto
                // every unary and streaming call.
                if copts.defaultns != "" {
                        unary, stream := newNSInterceptors(copts.defaultns)
                        gopts = append(gopts, grpc.WithChainUnaryInterceptor(unary))
                        gopts = append(gopts, grpc.WithChainStreamInterceptor(stream))
                }

                // connector is kept on the client so that Reconnect can dial
                // again with the same address, options and timeout.
                connector := func() (*grpc.ClientConn, error) {
                        ctx, cancel := context.WithTimeout(context.Background(), copts.timeout)
                        defer cancel()
                        conn, err := grpc.DialContext(ctx, dialer.DialAddress(address), gopts...)
                        if err != nil {
                                return nil, fmt.Errorf("failed to dial %q: %w", address, err)
                        }
                        return conn, nil
                }
                conn, err := connector()
                if err != nil {
                        return nil, err
                }
                c.conn, c.connector = conn, connector
        }
        // Without a connection or service overrides the client cannot do
        // anything useful.
        if copts.services == nil && c.conn == nil {
                return nil, fmt.Errorf("no grpc connection or services is available: %w", errdefs.ErrUnavailable)
        }

        // check namespace labels for default runtime
        if copts.defaultRuntime == "" && c.defaultns != "" {
                if label, err := c.GetLabel(context.Background(), defaults.DefaultRuntimeNSLabel); err != nil {
                        return nil, err
                } else if label != "" {
                        c.runtime = label
                }
        }

        return c, nil
}

// NewWithConn returns a new containerd client that is connected to the containerd
// instance provided by the connection.
//
// The default runtime is selected in the same order as in New: an explicitly
// configured default runtime wins; otherwise, when a default namespace is
// configured, that namespace's default-runtime label is used if it is
// non-empty; otherwise defaults.DefaultRuntime applies.
//
// The returned client has no connector, so Reconnect is unavailable on it.
func NewWithConn(conn *grpc.ClientConn, opts ...Opt) (*Client, error) {
        var copts clientOpts
        for _, o := range opts {
                if err := o(&copts); err != nil {
                        return nil, err
                }
        }
        c := &Client{
                defaultns: copts.defaultns,
                conn:      conn,
                runtime:   defaults.DefaultRuntime,
        }

        // Honor an explicitly configured default runtime. Previously the
        // option only suppressed the namespace label lookup below while the
        // configured value itself was silently dropped.
        if copts.defaultRuntime != "" {
                c.runtime = copts.defaultRuntime
        }

        if copts.defaultPlatform != nil {
                c.platform = copts.defaultPlatform
        } else {
                c.platform = platforms.Default()
        }

        // Install service overrides before the label lookup so that the lookup
        // goes through the same namespace service the client uses afterwards,
        // matching the order used by New.
        if copts.services != nil {
                c.services = *copts.services
        }

        // check namespace labels for default runtime
        if copts.defaultRuntime == "" && c.defaultns != "" {
                label, err := c.GetLabel(context.Background(), defaults.DefaultRuntimeNSLabel)
                if err != nil {
                        return nil, err
                }
                if label != "" {
                        c.runtime = label
                }
        }

        return c, nil
}

// Client is the client to interact with containerd and its various services
// using a uniform interface
type Client struct {
        // services holds optional per-service overrides; the service accessor
        // methods return an override when it is set and otherwise build a
        // client on top of conn.
        services
        // connMu guards access to conn.
        connMu    sync.Mutex
        // conn is the gRPC connection to containerd; it may be nil when the
        // client was built from service overrides only.
        conn      *grpc.ClientConn
        // runtime is the name of the default runtime used for new containers.
        runtime   string
        // defaultns is the namespace used when a context carries none.
        defaultns string
        // platform is the default platform matcher configured for the client.
        platform  platforms.MatchComparer
        // connector re-dials the daemon; it is only set when the client was
        // created by New with a non-empty address and is used by Reconnect.
        connector func() (*grpc.ClientConn, error)
}

// Reconnect closes the current gRPC connection and dials a new one using the
// connector captured when the client was created. It returns an error wrapping
// errdefs.ErrUnavailable when the client has no connector, or the dial error
// when re-dialing fails (in which case the old, closed connection is kept).
func (c *Client) Reconnect() error {
        if c.connector == nil {
                return fmt.Errorf("unable to reconnect to containerd, no connector available: %w", errdefs.ErrUnavailable)
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()

        // The result of closing the old connection is not inspected; a fresh
        // connection is dialed regardless.
        c.conn.Close()

        fresh, err := c.connector()
        if err == nil {
                c.conn = fresh
        }
        return err
}

// Runtime returns the name of the default runtime this client was configured
// with; NewContainer uses it as the runtime name of containers it creates.
func (c *Client) Runtime() string {
        return c.runtime
}

// IsServing reports whether the containerd daemon is reachable and its health
// service answers with the SERVING status.
//
// The health check is issued with WaitForReady, so the call blocks while the
// connection is in a transient failure state; set a deadline on ctx to bound
// the wait. An error wrapping errdefs.ErrUnavailable is returned when the
// client has no gRPC connection.
func (c *Client) IsServing(ctx context.Context) (bool, error) {
        c.connMu.Lock()
        connected := c.conn != nil
        c.connMu.Unlock()
        if !connected {
                return false, fmt.Errorf("no grpc connection available: %w", errdefs.ErrUnavailable)
        }

        resp, err := c.HealthService().Check(ctx, &grpc_health_v1.HealthCheckRequest{}, grpc.WaitForReady(true))
        if err != nil {
                return false, err
        }
        serving := resp.Status == grpc_health_v1.HealthCheckResponse_SERVING
        return serving, nil
}

// Containers lists the containers known to the container service, optionally
// restricted by filters, and wraps each record in a Container bound to this
// client. The returned slice is nil when no container matches.
func (c *Client) Containers(ctx context.Context, filters ...string) ([]Container, error) {
        records, err := c.ContainerService().List(ctx, filters...)
        if err != nil {
                return nil, err
        }

        var result []Container
        for i := range records {
                result = append(result, containerFromRecord(c, records[i]))
        }
        return result, nil
}

// NewContainer creates a new container with the provided id, which must be
// unique within the namespace.
//
// The container starts out with the client's default runtime; each option in
// opts is then applied in order and may modify the record before it is sent to
// the container service. The work runs under a lease that is released when the
// call returns.
func (c *Client) NewContainer(ctx context.Context, id string, opts ...NewContainerOpts) (Container, error) {
        ctx, done, err := c.WithLease(ctx)
        if err != nil {
                return nil, err
        }
        defer done(ctx)

        var record containers.Container
        record.ID = id
        record.Runtime.Name = c.runtime

        for _, apply := range opts {
                if err := apply(ctx, c, &record); err != nil {
                        return nil, err
                }
        }

        created, err := c.ContainerService().Create(ctx, record)
        if err != nil {
                return nil, err
        }
        return containerFromRecord(c, created), nil
}

// LoadContainer looks up the container with the given id in the container
// service and returns it wrapped in a Container bound to this client.
func (c *Client) LoadContainer(ctx context.Context, id string) (Container, error) {
        record, err := c.ContainerService().Get(ctx, id)
        if err == nil {
                return containerFromRecord(c, record), nil
        }
        return nil, err
}

// RemoteContext is used to configure object resolutions and transfers with
// remote content stores and image providers.
type RemoteContext struct {
        // Resolver is used to resolve names to objects, fetchers, and pushers.
        // If no resolver is provided, defaults to Docker registry resolver.
        Resolver remotes.Resolver

        // PlatformMatcher is used to match the platforms for an image
        // operation and define the preference when a single match is required
        // from multiple platforms.
        PlatformMatcher platforms.MatchComparer

        // Unpack is done after an image is pulled to extract into a snapshotter.
        // It is done simultaneously for schema 2 images when they are pulled.
        // If an image is not unpacked on pull, it can be unpacked any time
        // afterwards. Unpacking is required to run an image.
        Unpack bool

        // UnpackOpts handles options to the unpack call.
        UnpackOpts []UnpackOpt

        // Snapshotter used for unpacking
        Snapshotter string

        // SnapshotterOpts are additional options to be passed to a snapshotter during pull
        SnapshotterOpts []snapshots.Opt

        // Labels to be applied to the created image
        Labels map[string]string

        // BaseHandlers are a set of handlers which are called on dispatch.
        // These handlers always get called before any operation specific
        // handlers.
        BaseHandlers []images.Handler

        // HandlerWrapper wraps the handler which gets sent to dispatch.
        // Unlike BaseHandlers, this can run before and after the built
        // in handlers, allowing operations to run on the descriptor
        // after it has completed transferring.
        HandlerWrapper func(images.Handler) images.Handler

        // ConvertSchema1 is whether to convert Docker registry schema 1
        // manifests. If this option is false then any image which resolves
        // to schema 1 will return an error since schema 1 is not supported.
        //
        // Deprecated: use Schema 2 or OCI images.
        ConvertSchema1 bool

        // Platforms defines which platforms to handle when doing the image operation.
        // Platforms is ignored when a PlatformMatcher is set, otherwise the
        // platforms will be used to create a PlatformMatcher with no ordering
        // preference.
        Platforms []string

        // MaxConcurrentDownloads is the max concurrent content downloads for each pull.
        MaxConcurrentDownloads int

        // MaxConcurrentUploadedLayers is the max concurrent uploaded layers for each push.
        MaxConcurrentUploadedLayers int

        // AllMetadata downloads all manifests and known-configuration files
        AllMetadata bool

        // ChildLabelMap sets the labels used to reference child objects in the content
        // store. By default, all GC reference labels will be set for all fetched content.
        ChildLabelMap func(ocispec.Descriptor) []string
}

// defaultRemoteContext returns the RemoteContext that remote operations start
// from before options are applied: every field is left at its zero value
// except Resolver, which is a Docker registry resolver with default options.
func defaultRemoteContext() *RemoteContext {
        rc := new(RemoteContext)
        rc.Resolver = docker.NewResolver(docker.ResolverOptions{})
        return rc
}

// Fetch downloads the content referenced by ref into containerd's content
// store and returns a non-platform specific image reference.
//
// Unpacking is not supported here; requesting it yields an error wrapping
// errdefs.ErrNotImplemented. When no PlatformMatcher is configured, one is
// derived from Platforms, or all platforms are matched when that list is
// empty. The download runs under a lease that is released on return.
func (c *Client) Fetch(ctx context.Context, ref string, opts ...RemoteOpt) (images.Image, error) {
        rctx := defaultRemoteContext()
        for _, apply := range opts {
                if err := apply(c, rctx); err != nil {
                        return images.Image{}, err
                }
        }

        if rctx.Unpack {
                return images.Image{}, fmt.Errorf("unpack on fetch not supported, try pull: %w", errdefs.ErrNotImplemented)
        }

        if rctx.PlatformMatcher == nil {
                if len(rctx.Platforms) > 0 {
                        parsed, err := platforms.ParseAll(rctx.Platforms)
                        if err != nil {
                                return images.Image{}, err
                        }
                        rctx.PlatformMatcher = platforms.Any(parsed...)
                } else {
                        rctx.PlatformMatcher = platforms.All
                }
        }

        ctx, done, err := c.WithLease(ctx)
        if err != nil {
                return images.Image{}, err
        }
        defer done(ctx)

        fetched, err := c.fetch(ctx, rctx, ref, 0)
        if err != nil {
                return images.Image{}, err
        }
        return c.createNewImage(ctx, fetched)
}

// Push uploads the content described by desc to the remote resource named by
// ref, using the resolver configured through opts (a Docker registry resolver
// by default).
//
// When no PlatformMatcher is configured, one is derived from Platforms, or all
// platforms are matched when that list is empty. BaseHandlers, if any, run
// before the push handler, and HandlerWrapper, if set, wraps the resulting
// handler. A positive MaxConcurrentUploadedLayers bounds the number of
// concurrent uploads.
func (c *Client) Push(ctx context.Context, ref string, desc ocispec.Descriptor, opts ...RemoteOpt) error {
        pushCtx := defaultRemoteContext()
        for _, o := range opts {
                if err := o(c, pushCtx); err != nil {
                        return err
                }
        }
        if pushCtx.PlatformMatcher == nil {
                if len(pushCtx.Platforms) > 0 {
                        ps, err := platforms.ParseAll(pushCtx.Platforms)
                        if err != nil {
                                return err
                        }
                        pushCtx.PlatformMatcher = platforms.Any(ps...)
                } else {
                        pushCtx.PlatformMatcher = platforms.All
                }
        }

        // Annotate ref with the digest so that the tag is pushed only for
        // this single digest.
        if !strings.Contains(ref, "@") {
                ref = ref + "@" + desc.Digest.String()
        }

        pusher, err := pushCtx.Resolver.Pusher(ctx, ref)
        if err != nil {
                return err
        }

        var wrapper func(images.Handler) images.Handler

        if len(pushCtx.BaseHandlers) > 0 {
                wrapper = func(h images.Handler) images.Handler {
                        // Build the chain in a fresh slice. Appending directly
                        // to BaseHandlers could write into the caller's backing
                        // array when it has spare capacity.
                        chain := make([]images.Handler, 0, len(pushCtx.BaseHandlers)+1)
                        chain = append(chain, pushCtx.BaseHandlers...)
                        chain = append(chain, h)
                        h = images.Handlers(chain...)
                        if pushCtx.HandlerWrapper != nil {
                                h = pushCtx.HandlerWrapper(h)
                        }
                        return h
                }
        } else if pushCtx.HandlerWrapper != nil {
                wrapper = pushCtx.HandlerWrapper
        }

        // A nil limiter leaves upload concurrency unbounded.
        var limiter *semaphore.Weighted
        if pushCtx.MaxConcurrentUploadedLayers > 0 {
                limiter = semaphore.NewWeighted(int64(pushCtx.MaxConcurrentUploadedLayers))
        }

        return remotes.PushContent(ctx, pusher, desc, c.ContentStore(), limiter, pushCtx.PlatformMatcher, wrapper)
}

// GetImage looks up the image named ref in the image service and returns it
// wrapped in an Image bound to this client.
func (c *Client) GetImage(ctx context.Context, ref string) (Image, error) {
        record, err := c.ImageService().Get(ctx, ref)
        if err == nil {
                return NewImage(c, record), nil
        }
        return nil, err
}

// ListImages lists the images known to the image service, optionally
// restricted by filters, each wrapped in an Image bound to this client.
func (c *Client) ListImages(ctx context.Context, filters ...string) ([]Image, error) {
        records, err := c.ImageService().List(ctx, filters...)
        if err != nil {
                return nil, err
        }

        result := make([]Image, 0, len(records))
        for _, record := range records {
                result = append(result, NewImage(c, record))
        }
        return result, nil
}

// Restore creates a container with the given id from a checkpoint image.
//
// The checkpoint's index is read from the content store first; each restore
// option is then turned into a container option and the container is created
// through NewContainer. The work runs under a lease that is released when the
// call returns.
func (c *Client) Restore(ctx context.Context, id string, checkpoint Image, opts ...RestoreOpts) (Container, error) {
        index, err := decodeIndex(ctx, c.ContentStore(), checkpoint.Target())
        if err != nil {
                return nil, err
        }

        ctx, done, err := c.WithLease(ctx)
        if err != nil {
                return nil, err
        }
        defer done(ctx)

        containerOpts := make([]NewContainerOpts, 0, len(opts))
        for _, restoreOpt := range opts {
                containerOpts = append(containerOpts, restoreOpt(ctx, id, c, checkpoint, index))
        }

        restored, err := c.NewContainer(ctx, id, containerOpts...)
        if err != nil {
                return nil, err
        }
        return restored, nil
}

// writeIndex serializes index as JSON and writes it to the client's content
// store under ref with the OCI image index media type. Each manifest in the
// index is recorded as a "containerd.io/gc.ref.content.<n>" label holding the
// manifest digest. It returns the descriptor reported by writeContent.
func writeIndex(ctx context.Context, index *ocispec.Index, client *Client, ref string) (d ocispec.Descriptor, err error) {
        gcLabels := make(map[string]string, len(index.Manifests))
        for n := range index.Manifests {
                key := fmt.Sprintf("containerd.io/gc.ref.content.%d", n)
                gcLabels[key] = index.Manifests[n].Digest.String()
        }

        payload, err := json.Marshal(index)
        if err != nil {
                return ocispec.Descriptor{}, err
        }

        store := client.ContentStore()
        return writeContent(ctx, store, ocispec.MediaTypeImageIndex, ref, bytes.NewReader(payload), content.WithLabels(gcLabels))
}

// decodeIndex reads the blob described by desc from store and decodes it as a
// JSON-encoded OCI image index.
func decodeIndex(ctx context.Context, store content.Provider, desc ocispec.Descriptor) (*ocispec.Index, error) {
        blob, err := content.ReadBlob(ctx, store, desc)
        if err != nil {
                return nil, err
        }

        decoded := new(ocispec.Index)
        if err := json.Unmarshal(blob, decoded); err != nil {
                return nil, err
        }
        return decoded, nil
}

// GetLabel returns the value of the given label on a namespace.
//
// The namespace is taken from ctx; when ctx carries none, the client's default
// namespace is used, and if that is empty too the namespace error is returned.
// A label that is not set on the namespace yields an empty string and a nil
// error.
func (c *Client) GetLabel(ctx context.Context, label string) (string, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        switch {
        case err == nil:
                // ctx carries a namespace; use it as is.
        case c.defaultns != "":
                ns = c.defaultns
        default:
                return "", err
        }

        nsLabels, err := c.NamespaceService().Labels(ctx, ns)
        if err != nil {
                return "", err
        }
        return nsLabels[label], nil
}

// Subscribe to events that match one or more of the provided filters.
//
// Callers should listen on both the envelope and errs channels. If the errs
// channel returns nil or an error, the subscriber should terminate.
//
// The subscriber can stop receiving events by canceling the provided context.
// The errs channel will be closed and return a nil error.
//
// The call is forwarded unchanged to the client's event service.
func (c *Client) Subscribe(ctx context.Context, filters ...string) (ch <-chan *events.Envelope, errs <-chan error) {
        return c.EventService().Subscribe(ctx, filters...)
}

// Close closes the client's connection to containerd. It is a no-op that
// returns nil when the client has no gRPC connection.
func (c *Client) Close() error {
        c.connMu.Lock()
        defer c.connMu.Unlock()

        if c.conn == nil {
                return nil
        }
        return c.conn.Close()
}

// NamespaceService returns the namespace store: the configured override when
// one is set, otherwise a store backed by the client's gRPC connection.
func (c *Client) NamespaceService() namespaces.Store {
        if store := c.namespaceStore; store != nil {
                return store
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        remote := namespacesapi.NewNamespacesClient(c.conn)
        return NewNamespaceStoreFromClient(remote)
}

// ContainerService returns the container store: the configured override when
// one is set, otherwise a store backed by the client's gRPC connection.
func (c *Client) ContainerService() containers.Store {
        if store := c.containerStore; store != nil {
                return store
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        remote := containersapi.NewContainersClient(c.conn)
        return NewRemoteContainerStore(remote)
}

// ContentStore returns the content store: the configured override when one is
// set, otherwise a proxy store backed by the client's gRPC connection.
func (c *Client) ContentStore() content.Store {
        if store := c.contentStore; store != nil {
                return store
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        return contentproxy.NewContentStore(c.conn)
}

// SnapshotService returns the snapshotter for the provided snapshotter name.
//
// The name is resolved first (using a background context); if resolution
// fails, defaults.DefaultSnapshotter is used instead. When snapshotter
// overrides are configured the result is looked up among them and may be nil
// for an unknown name; otherwise a proxy snapshotter backed by the client's
// gRPC connection is returned.
func (c *Client) SnapshotService(snapshotterName string) snapshots.Snapshotter {
        resolved, err := c.resolveSnapshotterName(context.Background(), snapshotterName)
        if err != nil {
                resolved = defaults.DefaultSnapshotter
        }

        if overrides := c.snapshotters; overrides != nil {
                return overrides[resolved]
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        remote := snapshotsapi.NewSnapshotsClient(c.conn)
        return snproxy.NewSnapshotter(remote, resolved)
}

// DefaultNamespace returns the default namespace the client was configured
// with, or an empty string when none was set.
func (c *Client) DefaultNamespace() string {
        return c.defaultns
}

// TaskService returns the tasks client: the configured override when one is
// set, otherwise a client backed by the client's gRPC connection.
func (c *Client) TaskService() tasks.TasksClient {
        if svc := c.taskService; svc != nil {
                return svc
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        return tasks.NewTasksClient(c.conn)
}

// ImageService returns the image store: the configured override when one is
// set, otherwise a store backed by the client's gRPC connection.
func (c *Client) ImageService() images.Store {
        if store := c.imageStore; store != nil {
                return store
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        remote := imagesapi.NewImagesClient(c.conn)
        return NewImageStoreFromClient(remote)
}

// DiffService returns the differ: the configured override when one is set,
// otherwise a service backed by the client's gRPC connection.
func (c *Client) DiffService() DiffService {
        if svc := c.diffService; svc != nil {
                return svc
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        remote := diffapi.NewDiffClient(c.conn)
        return NewDiffServiceFromClient(remote)
}

// IntrospectionService returns the introspection service: the configured
// override when one is set, otherwise a proxy backed by the client's gRPC
// connection.
func (c *Client) IntrospectionService() introspection.Service {
        if svc := c.introspectionService; svc != nil {
                return svc
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        return introspectionproxy.NewIntrospectionProxy(c.conn)
}

// LeasesService returns the lease manager: the configured override when one
// is set, otherwise a manager backed by the client's gRPC connection.
func (c *Client) LeasesService() leases.Manager {
        if mgr := c.leasesService; mgr != nil {
                return mgr
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        remote := leasesapi.NewLeasesClient(c.conn)
        return leasesproxy.NewLeaseManager(remote)
}

// HealthService returns the underlying GRPC HealthClient.
//
// Unlike most service accessors on Client there is no override for it: the
// client is always built on the client's gRPC connection, which is read under
// connMu.
func (c *Client) HealthService() grpc_health_v1.HealthClient {
        c.connMu.Lock()
        defer c.connMu.Unlock()
        return grpc_health_v1.NewHealthClient(c.conn)
}

// EventService returns the event service: the configured override when one is
// set, otherwise a remote events proxy backed by the client's gRPC connection.
func (c *Client) EventService() EventService {
        if svc := c.eventService; svc != nil {
                return svc
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        return eventsproxy.NewRemoteEvents(c.conn)
}

// SandboxStore returns the sandbox store: the configured override when one is
// set, otherwise a proxy store backed by the client's gRPC connection.
func (c *Client) SandboxStore() sandbox.Store {
        if store := c.sandboxStore; store != nil {
                return store
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        remote := sandboxsapi.NewStoreClient(c.conn)
        return sandboxproxy.NewSandboxStore(remote)
}

// SandboxController returns the sandbox controller client.
//
// When sandbox controller overrides are configured, the one registered under
// name is returned (nil for an unknown name). Otherwise a proxy controller
// backed by the client's gRPC connection is returned; name is not used on
// that path.
func (c *Client) SandboxController(name string) sandbox.Controller {
        if overrides := c.sandboxers; overrides != nil {
                return overrides[name]
        }

        c.connMu.Lock()
        defer c.connMu.Unlock()
        remote := sandboxsapi.NewControllerClient(c.conn)
        return sandboxproxy.NewSandboxController(remote)
}

// VersionService returns the underlying VersionClient.
//
// There is no override for it: the client is always built on the client's
// gRPC connection, which is read under connMu.
func (c *Client) VersionService() versionservice.VersionClient {
        c.connMu.Lock()
        defer c.connMu.Unlock()
        return versionservice.NewVersionClient(c.conn)
}

// Conn returns the underlying RPC connection object
// Either *grpc.ClientConn or *ttrpc.Conn
//
// In this implementation the value is the client's conn field, read under
// connMu. Because that field is a *grpc.ClientConn returned through an any, a
// client without a connection yields a non-nil interface value wrapping a nil
// pointer; type-assert before checking for nil.
func (c *Client) Conn() any {
        c.connMu.Lock()
        defer c.connMu.Unlock()
        return c.conn
}

// Version describes the containerd version reported by the daemon, as
// returned by Client.Version.
type Version struct {
        // Version number
        Version string
        // Revision from git that was built
        Revision string
}

// Version queries the version service and returns the version and revision of
// the containerd daemon the client is connected to. An error wrapping
// errdefs.ErrUnavailable is returned when the client has no gRPC connection.
func (c *Client) Version(ctx context.Context) (Version, error) {
        c.connMu.Lock()
        connected := c.conn != nil
        c.connMu.Unlock()
        if !connected {
                return Version{}, fmt.Errorf("no grpc connection available: %w", errdefs.ErrUnavailable)
        }

        resp, err := c.VersionService().Version(ctx, &ptypes.Empty{})
        if err != nil {
                return Version{}, err
        }

        var v Version
        v.Version = resp.Version
        v.Revision = resp.Revision
        return v, nil
}

// ServerInfo represents the introspected server information, as returned by
// Client.Server.
type ServerInfo struct {
        // UUID is the UUID reported by the introspection service for the server.
        UUID string
}

// Server queries the introspection service and returns the server
// information it reports. An error wrapping errdefs.ErrUnavailable is returned
// when the client has no gRPC connection.
func (c *Client) Server(ctx context.Context) (ServerInfo, error) {
        c.connMu.Lock()
        connected := c.conn != nil
        c.connMu.Unlock()
        if !connected {
                return ServerInfo{}, fmt.Errorf("no grpc connection available: %w", errdefs.ErrUnavailable)
        }

        resp, err := c.IntrospectionService().Server(ctx)
        if err != nil {
                return ServerInfo{}, err
        }

        var info ServerInfo
        info.UUID = resp.UUID
        return info, nil
}

// resolveSnapshotterName returns the snapshotter name to use. A non-empty
// name is returned unchanged. For an empty name the namespace's
// default-snapshotter label is consulted, falling back to
// defaults.DefaultSnapshotter when the label is empty.
func (c *Client) resolveSnapshotterName(ctx context.Context, name string) (string, error) {
        if name != "" {
                return name, nil
        }

        label, err := c.GetLabel(ctx, defaults.DefaultSnapshotterNSLabel)
        if err != nil {
                return "", err
        }
        if label == "" {
                return defaults.DefaultSnapshotter, nil
        }
        return label, nil
}

// getSnapshotter resolves name to a snapshotter name and returns the matching
// snapshotter. An error wrapping errdefs.ErrNotFound is returned when the
// snapshot service has no snapshotter for the resolved name.
func (c *Client) getSnapshotter(ctx context.Context, name string) (snapshots.Snapshotter, error) {
        resolved, err := c.resolveSnapshotterName(ctx, name)
        if err != nil {
                return nil, err
        }

        if sn := c.SnapshotService(resolved); sn != nil {
                return sn, nil
        }
        return nil, fmt.Errorf("snapshotter %s was not found: %w", resolved, errdefs.ErrNotFound)
}

// GetSnapshotterSupportedPlatforms returns a platform matcher that accepts any
// of the platforms reported for the given snapshotter plugin by the
// introspection service. An error is returned when the service reports no
// matching plugin.
func (c *Client) GetSnapshotterSupportedPlatforms(ctx context.Context, snapshotterName string) (platforms.MatchComparer, error) {
        filter := fmt.Sprintf("type==%s, id==%s", plugins.SnapshotPlugin, snapshotterName)

        resp, err := c.IntrospectionService().Plugins(ctx, filter)
        if err != nil {
                return nil, err
        }
        if len(resp.Plugins) == 0 {
                return nil, fmt.Errorf("inspection service could not find snapshotter %s plugin", snapshotterName)
        }

        // Only the first matching plugin is considered.
        supported := toPlatforms(resp.Plugins[0].Platforms)
        return platforms.Any(supported...), nil
}

// toPlatforms converts API platform messages into OCI platform values,
// copying the architecture, OS and variant of each entry in order.
func toPlatforms(pt []*apitypes.Platform) []ocispec.Platform {
        converted := make([]ocispec.Platform, 0, len(pt))
        for _, p := range pt {
                converted = append(converted, ocispec.Platform{
                        Architecture: p.Architecture,
                        OS:           p.OS,
                        Variant:      p.Variant,
                })
        }
        return converted
}

// GetSnapshotterCapabilities returns the capabilities reported for the given
// snapshotter plugin by the introspection service. An error is returned when
// the service reports no matching plugin.
func (c *Client) GetSnapshotterCapabilities(ctx context.Context, snapshotterName string) ([]string, error) {
        filter := fmt.Sprintf("type==%s, id==%s", plugins.SnapshotPlugin, snapshotterName)

        resp, err := c.IntrospectionService().Plugins(ctx, filter)
        if err != nil {
                return nil, err
        }
        if len(resp.Plugins) == 0 {
                return nil, fmt.Errorf("inspection service could not find snapshotter %s plugin", snapshotterName)
        }

        // Only the first matching plugin is considered.
        return resp.Plugins[0].Capabilities, nil
}

// RuntimeVersion holds the version and revision strings reported for a
// runtime.
type RuntimeVersion struct {
        Version  string
        Revision string
}

// RuntimeInfo is the client-side view of the runtime information returned by
// Client.RuntimeInfo.
type RuntimeInfo struct {
        // Name is the runtime name as reported by the runtime plugin.
        Name string
        // Version carries the reported version and revision; it is left at its
        // zero value when the plugin reports no version.
        Version RuntimeVersion
        // Options is the unmarshalled options value, or nil when none is reported.
        Options interface{}
        // Features is the unmarshalled features value, or nil when none is reported.
        Features interface{}
        // Annotations are copied as-is from the plugin response.
        Annotations map[string]string
}

// RuntimeInfo asks the introspection service for information about a runtime.
//
// runtimePath selects the runtime to query; when empty, the client's default
// runtime is used. runtimeOptions, when non-nil, is marshalled and sent along
// with the request. The reply is decoded into a RuntimeInfo, with the Options
// and Features payloads unmarshalled into their registered types.
func (c *Client) RuntimeInfo(ctx context.Context, runtimePath string, runtimeOptions interface{}) (*RuntimeInfo, error) {
        req := &apitypes.RuntimeRequest{RuntimePath: c.runtime}
        if runtimePath != "" {
                req.RuntimePath = runtimePath
        }
        if runtimeOptions != nil {
                marshalled, err := protobuf.MarshalAnyToProto(runtimeOptions)
                if err != nil {
                        return nil, fmt.Errorf("failed to marshal %T: %w", runtimeOptions, err)
                }
                req.Options = marshalled
        }

        resp, err := c.IntrospectionService().PluginInfo(ctx, string(plugins.RuntimePluginV2), "task", req)
        if err != nil {
                return nil, err
        }

        var raw apitypes.RuntimeInfo
        if err := typeurl.UnmarshalTo(resp.Extra, &raw); err != nil {
                return nil, fmt.Errorf("failed to get runtime info from plugin info: %w", err)
        }

        out := &RuntimeInfo{
                Name:        raw.Name,
                Annotations: raw.Annotations,
        }
        if v := raw.Version; v != nil {
                out.Version = RuntimeVersion{
                        Version:  v.Version,
                        Revision: v.Revision,
                }
        }
        if raw.Options != nil {
                if out.Options, err = typeurl.UnmarshalAny(raw.Options); err != nil {
                        return nil, fmt.Errorf("failed to unmarshal RuntimeInfo.Options (%T): %w", raw.Options, err)
                }
        }
        if raw.Features != nil {
                if out.Features, err = typeurl.UnmarshalAny(raw.Features); err != nil {
                        return nil, fmt.Errorf("failed to unmarshal RuntimeInfo.Features (%T): %w", raw.Features, err)
                }
        }
        return out, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "time"

        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/platforms"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"

        "google.golang.org/grpc"
)

// clientOpts collects the settings that Opt functions apply to a client.
type clientOpts struct {
        defaultns       string                  // set by WithDefaultNamespace
        defaultRuntime  string                  // set by WithDefaultRuntime
        defaultPlatform platforms.MatchComparer // set by WithDefaultPlatform
        services        *services               // set by WithServices
        dialOptions     []grpc.DialOption       // set by WithDialOpts
        callOptions     []grpc.CallOption       // set by WithCallOpts
        timeout         time.Duration           // set by WithTimeout
}

// Opt allows callers to set options on the containerd client. Each Opt
// receives the option set being built and may return an error.
type Opt func(c *clientOpts) error

// WithDefaultNamespace returns an Opt that records ns as the client's default
// namespace.
//
// Any operation that does not have a namespace set on the context will
// be provided the default namespace.
func WithDefaultNamespace(ns string) Opt {
        return func(co *clientOpts) error {
                co.defaultns = ns
                return nil
        }
}

// WithDefaultRuntime returns an Opt that records rt as the client's default
// runtime.
func WithDefaultRuntime(rt string) Opt {
        return func(co *clientOpts) error {
                co.defaultRuntime = rt
                return nil
        }
}

// WithDefaultPlatform returns an Opt that records platform as the client's
// default platform matcher.
func WithDefaultPlatform(platform platforms.MatchComparer) Opt {
        return func(co *clientOpts) error {
                co.defaultPlatform = platform
                return nil
        }
}

// WithDialOpts returns an Opt that replaces the client's grpc.DialOptions
// with opts.
func WithDialOpts(opts []grpc.DialOption) Opt {
        return func(co *clientOpts) error {
                co.dialOptions = opts
                return nil
        }
}

// WithCallOpts returns an Opt that replaces the client's grpc.CallOptions
// with opts.
func WithCallOpts(opts []grpc.CallOption) Opt {
        return func(co *clientOpts) error {
                co.callOptions = opts
                return nil
        }
}

// WithServices returns an Opt that installs a fresh service set on the client,
// configured by applying each of opts to it in order. Any previously
// configured service set is replaced.
func WithServices(opts ...ServicesOpt) Opt {
        return func(co *clientOpts) error {
                svc := &services{}
                co.services = svc
                for _, apply := range opts {
                        apply(svc)
                }
                return nil
        }
}

// WithTimeout returns an Opt that records d as the client's connection
// timeout.
func WithTimeout(d time.Duration) Opt {
        return func(co *clientOpts) error {
                co.timeout = d
                return nil
        }
}

// RemoteOpt allows the caller to set distribution options for a remote. Each
// RemoteOpt receives the client and the RemoteContext being configured and
// may return an error.
type RemoteOpt func(*Client, *RemoteContext) error

// WithPlatform returns a RemoteOpt that adds platform to the list of
// platforms to retrieve content for. An empty platform is replaced by
// platforms.DefaultString(). A platform already present in the list is not
// added again.
func WithPlatform(platform string) RemoteOpt {
        if platform == "" {
                platform = platforms.DefaultString()
        }
        return func(_ *Client, rc *RemoteContext) error {
                for i := range rc.Platforms {
                        if rc.Platforms[i] == platform {
                                // already requested; nothing to add
                                return nil
                        }
                }
                rc.Platforms = append(rc.Platforms, platform)
                return nil
        }
}

// WithPlatformMatcher returns a RemoteOpt that sets the matcher used to
// decide which platforms to pull content for.
// This value supersedes anything set with `WithPlatform`.
func WithPlatformMatcher(m platforms.MatchComparer) RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                rc.PlatformMatcher = m
                return nil
        }
}

// WithPullUnpack is used to unpack an image after pull. This
// uses the snapshotter, content store, and diff service
// configured for the client.
//
// It is itself a RemoteOpt: it sets Unpack on the remote context and never
// returns an error.
func WithPullUnpack(_ *Client, c *RemoteContext) error {
        c.Unpack = true
        return nil
}

// WithUnpackOpts returns a RemoteOpt that appends opts to the unpack options
// already present on the remote context.
func WithUnpackOpts(opts []UnpackOpt) RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                rc.UnpackOpts = append(rc.UnpackOpts, opts...)
                return nil
        }
}

// WithPullSnapshotter returns a RemoteOpt that selects the snapshotter used
// for unpacking by name and replaces the snapshotter options with opts.
func WithPullSnapshotter(snapshotterName string, opts ...snapshots.Opt) RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                rc.Snapshotter, rc.SnapshotterOpts = snapshotterName, opts
                return nil
        }
}

// WithPullLabel returns a RemoteOpt that associates a single key/value label
// with a pulled reference, creating the label map if it does not exist yet.
func WithPullLabel(key, value string) RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                dst := rc.Labels
                if dst == nil {
                        dst = make(map[string]string)
                        rc.Labels = dst
                }
                dst[key] = value
                return nil
        }
}

// WithPullLabels returns a RemoteOpt that merges labels into the label set of
// a pulled reference. Keys already present are overwritten; the label map is
// created if it does not exist yet.
func WithPullLabels(labels map[string]string) RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                dst := rc.Labels
                if dst == nil {
                        dst = make(map[string]string)
                        rc.Labels = dst
                }
                for key, value := range labels {
                        dst[key] = value
                }
                return nil
        }
}

// WithChildLabelMap returns a RemoteOpt that sets the map function used to
// define the labels set on referenced child content in the content store.
// This can be used to overwrite the default GC labels or filter which labels
// get set for content.
// The default is `images.ChildGCLabels`.
func WithChildLabelMap(fn func(ocispec.Descriptor) []string) RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                rc.ChildLabelMap = fn
                return nil
        }
}

// WithSchema1Conversion is used to convert Docker registry schema 1
// manifests to oci manifests on pull. Without this option schema 1
// manifests will return a not supported error.
//
// It is itself a RemoteOpt: it sets ConvertSchema1 on the remote context,
// does not use the client argument, and never returns an error.
//
// Deprecated: use Schema 2 or OCI images.
func WithSchema1Conversion(client *Client, c *RemoteContext) error {
        c.ConvertSchema1 = true
        return nil
}

// WithResolver returns a RemoteOpt that sets the resolver used by the remote
// context.
func WithResolver(resolver remotes.Resolver) RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                rc.Resolver = resolver
                return nil
        }
}

// WithImageHandler returns a RemoteOpt that appends h to the base handlers
// called on dispatch.
func WithImageHandler(h images.Handler) RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                rc.BaseHandlers = append(rc.BaseHandlers, h)
                return nil
        }
}

// WithImageHandlerWrapper returns a RemoteOpt that sets w as the wrapper
// applied to the handlers called on dispatch, replacing any wrapper set
// earlier.
func WithImageHandlerWrapper(w func(images.Handler) images.Handler) RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                rc.HandlerWrapper = w
                return nil
        }
}

// WithMaxConcurrentDownloads returns a RemoteOpt that sets the limit on
// concurrent downloads to max.
func WithMaxConcurrentDownloads(max int) RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                rc.MaxConcurrentDownloads = max
                return nil
        }
}

// WithMaxConcurrentUploadedLayers returns a RemoteOpt that sets the limit on
// concurrently uploaded layers to max.
func WithMaxConcurrentUploadedLayers(max int) RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                rc.MaxConcurrentUploadedLayers = max
                return nil
        }
}

// WithAllMetadata returns a RemoteOpt that requests download of all manifests
// and known-configuration files by setting AllMetadata on the remote context.
func WithAllMetadata() RemoteOpt {
        return func(_ *Client, rc *RemoteContext) error {
                rc.AllMetadata = true
                return nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "encoding/json"
        "fmt"
        "os"
        "path/filepath"
        "strings"

        "github.com/containerd/containerd/api/services/tasks/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/api/types/runc/options"
        tasktypes "github.com/containerd/containerd/api/types/task"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/cio"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
        "github.com/containerd/fifo"
        "github.com/containerd/typeurl/v2"
        ver "github.com/opencontainers/image-spec/specs-go"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "github.com/opencontainers/selinux/go-selinux/label"
)

// Annotation keys written on a checkpoint index by container.Checkpoint.
const (
        // checkpointRuntimeNameLabel records the container's runtime name.
        checkpointRuntimeNameLabel = "io.containerd.checkpoint.runtime"
        // checkpointSnapshotterNameLabel records the container's snapshotter.
        checkpointSnapshotterNameLabel = "io.containerd.checkpoint.snapshotter"
)

// Container is a metadata object for container resources and task creation.
// The unexported container type in this file is its implementation.
type Container interface {
        // ID identifies the container
        ID() string
        // Info returns the underlying container record type
        Info(context.Context, ...InfoOpts) (containers.Container, error)
        // Delete removes the container
        Delete(context.Context, ...DeleteOpts) error
        // NewTask creates a new task based on the container metadata
        NewTask(context.Context, cio.Creator, ...NewTaskOpts) (Task, error)
        // Spec returns the OCI runtime specification
        Spec(context.Context) (*oci.Spec, error)
        // Task returns the current task for the container
        //
        // If cio.Load options are passed the client will Load the IO for the running
        // task.
        //
        // If cio.Attach options are passed the client will reattach to the IO for the running
        // task.
        //
        // If no task exists for the container a NotFound error is returned
        //
        // Clients must make sure that only one reader is attached to the task and consuming
        // the output from the task's fifos
        Task(context.Context, cio.Attach) (Task, error)
        // Image returns the image that the container is based on
        Image(context.Context) (Image, error)
        // Labels returns the labels set on the container
        Labels(context.Context) (map[string]string, error)
        // SetLabels sets the provided labels for the container and returns the final label set
        SetLabels(context.Context, map[string]string) (map[string]string, error)
        // Extensions returns the extensions set on the container
        Extensions(context.Context) (map[string]typeurl.Any, error)
        // Update applies the given options to the container record and stores
        // the result
        Update(context.Context, ...UpdateContainerOpts) error
        // Checkpoint creates a checkpoint image of the current container
        Checkpoint(context.Context, string, ...CheckpointOpts) (Image, error)
}

// containerFromRecord wraps a container record in a client-side container
// handle bound to client. The record is kept as the handle's cached metadata.
func containerFromRecord(client *Client, c containers.Container) *container {
        handle := new(container)
        handle.client = client
        handle.id = c.ID
        handle.metadata = c
        return handle
}

// Compile-time check that *container implements Container.
var _ = (Container)(&container{})

// container is the client-side implementation of Container.
type container struct {
        client *Client // client used for all service calls
        id     string  // container ID, taken from the record at construction
        // metadata is the cached container record; Info replaces it when it
        // refreshes.
        metadata containers.Container
}

// ID returns the container's unique id. It reads the locally stored id and
// makes no service call.
func (c *container) ID() string {
        return c.id
}

// Info returns the container record. By default the record is re-fetched from
// the container service and cached on the handle; options can disable the
// refresh, in which case the cached record is returned as-is. If the refresh
// fails, the previously cached record is returned together with the error.
func (c *container) Info(ctx context.Context, opts ...InfoOpts) (containers.Container, error) {
        // default to refreshing the container's local metadata
        cfg := InfoConfig{Refresh: true}
        for _, apply := range opts {
                apply(&cfg)
        }

        if !cfg.Refresh {
                return c.metadata, nil
        }

        fresh, err := c.get(ctx)
        if err != nil {
                return c.metadata, err
        }
        c.metadata = fresh
        return c.metadata, nil
}

// Extensions fetches the current container record and returns the extensions
// set on it.
func (c *container) Extensions(ctx context.Context) (map[string]typeurl.Any, error) {
        record, err := c.get(ctx)
        if err != nil {
                return nil, err
        }
        return record.Extensions, nil
}

// Labels fetches the current container record and returns the labels set on
// it.
func (c *container) Labels(ctx context.Context) (map[string]string, error) {
        record, err := c.get(ctx)
        if err != nil {
                return nil, err
        }
        return record.Labels, nil
}

// SetLabels updates the labels given in labels on the container and returns
// the label set of the updated record. The update is restricted to one field
// path per provided key, so labels that are not mentioned are left alone.
func (c *container) SetLabels(ctx context.Context, labels map[string]string) (map[string]string, error) {
        // Build a "labels.<key>" field path for every label being set.
        var fieldpaths []string
        for key := range labels {
                fieldpaths = append(fieldpaths, "labels."+key)
        }

        record := containers.Container{
                ID:     c.id,
                Labels: labels,
        }
        updated, err := c.client.ContainerService().Update(ctx, record, fieldpaths...)
        if err != nil {
                return nil, err
        }
        return updated.Labels, nil
}

// Spec fetches the current container record and decodes its stored spec
// value from JSON into an OCI runtime specification.
func (c *container) Spec(ctx context.Context) (*oci.Spec, error) {
        record, err := c.get(ctx)
        if err != nil {
                return nil, err
        }

        spec := new(oci.Spec)
        if err := json.Unmarshal(record.Spec.GetValue(), spec); err != nil {
                return nil, err
        }
        return spec, nil
}

// Delete removes an existing container. If a task can be loaded for the
// container, the delete is refused with an error wrapping
// errdefs.ErrFailedPrecondition. Otherwise each option is run against the
// current container record before the record is deleted.
func (c *container) Delete(ctx context.Context, opts ...DeleteOpts) error {
        _, err := c.loadTask(ctx, nil)
        if err == nil {
                // a task was found, so the container is still in use
                return fmt.Errorf("cannot delete running task %v: %w", c.id, errdefs.ErrFailedPrecondition)
        }

        record, err := c.get(ctx)
        if err != nil {
                return err
        }
        for _, apply := range opts {
                if err := apply(ctx, c.client, record); err != nil {
                        return err
                }
        }

        return c.client.ContainerService().Delete(ctx, c.id)
}

// Task returns the current task for the container by delegating to loadTask;
// attach, when non-nil, is used to attach to the task's existing IO.
func (c *container) Task(ctx context.Context, attach cio.Attach) (Task, error) {
        return c.loadTask(ctx, attach)
}

// Image returns the image that the container is based on. It fetches the
// current container record and looks its image name up in the image service.
// A record with no image name yields an error wrapping errdefs.ErrNotFound.
func (c *container) Image(ctx context.Context) (Image, error) {
        record, err := c.get(ctx)
        if err != nil {
                return nil, err
        }

        name := record.Image
        if name == "" {
                return nil, fmt.Errorf("container not created from an image: %w", errdefs.ErrNotFound)
        }

        img, err := c.client.ImageService().Get(ctx, name)
        if err != nil {
                return nil, fmt.Errorf("failed to get image %s for container: %w", name, err)
        }
        return NewImage(c.client, img), nil
}

// NewTask creates a new task for the container through the task service.
//
// The task's IO is created with ioCreate. When the container record has a
// snapshot key, the rootfs mounts are obtained from the container's
// snapshotter and added to the request; mounts supplied through opts
// (TaskInfo.RootFS) are appended after them. Runtime path, options and
// checkpoint set by opts are forwarded on the request.
//
// If any step after IO creation fails, the IO is cancelled and closed before
// returning. On success the returned task carries the pid reported by the
// task service.
func (c *container) NewTask(ctx context.Context, ioCreate cio.Creator, opts ...NewTaskOpts) (_ Task, err error) {
        i, err := ioCreate(c.id)
        if err != nil {
                return nil, err
        }
        // err is the named result, so this cleanup observes the error of every
        // return statement below, including those that return a shadowed err.
        defer func() {
                if err != nil && i != nil {
                        i.Cancel()
                        i.Close()
                }
        }()
        cfg := i.Config()
        request := &tasks.CreateTaskRequest{
                ContainerID: c.id,
                Terminal:    cfg.Terminal,
                Stdin:       cfg.Stdin,
                Stdout:      cfg.Stdout,
                Stderr:      cfg.Stderr,
        }
        r, err := c.get(ctx)
        if err != nil {
                return nil, err
        }
        if r.SnapshotKey != "" {
                if r.Snapshotter == "" {
                        return nil, fmt.Errorf("unable to resolve rootfs mounts without snapshotter on container: %w", errdefs.ErrInvalidArgument)
                }

                // get the rootfs from the snapshotter and add it to the request
                s, err := c.client.getSnapshotter(ctx, r.Snapshotter)
                if err != nil {
                        return nil, err
                }
                mounts, err := s.Mounts(ctx, r.SnapshotKey)
                if err != nil {
                        return nil, err
                }
                spec, err := c.Spec(ctx)
                if err != nil {
                        return nil, err
                }
                for _, m := range mounts {
                        // When the spec carries a Linux mount label, add the formatted
                        // label as an extra mount option.
                        if spec.Linux != nil && spec.Linux.MountLabel != "" {
                                if ml := label.FormatMountLabel("", spec.Linux.MountLabel); ml != "" {
                                        m.Options = append(m.Options, ml)
                                }
                        }
                        request.Rootfs = append(request.Rootfs, &types.Mount{
                                Type:    m.Type,
                                Source:  m.Source,
                                Target:  m.Target,
                                Options: m.Options,
                        })
                }
        }
        // Seed the task info with the container's runtime name, then let the
        // caller's options adjust it.
        info := TaskInfo{
                runtime: r.Runtime.Name,
        }
        for _, o := range opts {
                if err := o(ctx, c.client, &info); err != nil {
                        return nil, err
                }
        }
        // Mounts provided through options go after any snapshotter mounts.
        for _, m := range info.RootFS {
                request.Rootfs = append(request.Rootfs, &types.Mount{
                        Type:    m.Type,
                        Source:  m.Source,
                        Target:  m.Target,
                        Options: m.Options,
                })
        }
        request.RuntimePath = info.RuntimePath
        if info.Options != nil {
                o, err := typeurl.MarshalAny(info.Options)
                if err != nil {
                        return nil, err
                }
                request.Options = protobuf.FromAny(o)
        }
        t := &task{
                client: c.client,
                io:     i,
                id:     c.id,
                c:      c,
        }
        if info.Checkpoint != nil {
                request.Checkpoint = info.Checkpoint
        }
        response, err := c.client.TaskService().Create(ctx, request)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        t.pid = response.Pid
        return t, nil
}

// Update fetches the current container record, lets each option modify it in
// order, and writes the result back through the container service. Errors
// from the final write are converted with errdefs.FromGRPC.
func (c *container) Update(ctx context.Context, opts ...UpdateContainerOpts) error {
        // fetch the current container config before updating it
        record, err := c.get(ctx)
        if err != nil {
                return err
        }

        for _, apply := range opts {
                if err := apply(ctx, c.client, &record); err != nil {
                        return err
                }
        }

        _, err = c.client.ContainerService().Update(ctx, record)
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Checkpoint creates a checkpoint image of the container and registers it in
// the image service under the name ref.
//
// An OCI index is built and annotated with the container's image name,
// runtime name and snapshotter. Each option in opts may add manifests to the
// index or adjust the checkpoint options. The index is then written with
// writeIndex and an image named ref is created pointing at it.
func (c *container) Checkpoint(ctx context.Context, ref string, opts ...CheckpointOpts) (Image, error) {
        index := &ocispec.Index{
                Versioned: ver.Versioned{
                        SchemaVersion: 2,
                },
                Annotations: make(map[string]string),
        }
        // Starting checkpoint options; opts receive a pointer and may change them.
        copts := &options.CheckpointOptions{
                Exit:                false,
                OpenTcp:             false,
                ExternalUnixSockets: false,
                Terminal:            false,
                FileLocks:           true,
                EmptyNamespaces:     nil,
        }
        info, err := c.Info(ctx)
        if err != nil {
                return nil, err
        }

        img, err := c.Image(ctx)
        if err != nil {
                return nil, err
        }

        // From here on ctx is the context returned by WithLease; done is invoked
        // with it when this function returns.
        ctx, done, err := c.client.WithLease(ctx)
        if err != nil {
                return nil, err
        }
        defer done(ctx)

        // add image name to manifest
        index.Annotations[ocispec.AnnotationRefName] = img.Name()
        // add runtime info to index
        index.Annotations[checkpointRuntimeNameLabel] = info.Runtime.Name
        // add snapshotter info to index
        index.Annotations[checkpointSnapshotterNameLabel] = info.Snapshotter

        // process remaining opts
        for _, o := range opts {
                if err := o(ctx, c.client, &info, index, copts); err != nil {
                        // An already-exists error from an option is ignored; any other
                        // error aborts the checkpoint.
                        err = errdefs.FromGRPC(err)
                        if !errdefs.IsAlreadyExists(err) {
                                return nil, err
                        }
                }
        }

        desc, err := writeIndex(ctx, index, c.client, c.ID()+"index")
        if err != nil {
                return nil, err
        }
        i := images.Image{
                Name:   ref,
                Target: desc,
        }
        checkpoint, err := c.client.ImageService().Create(ctx, i)
        if err != nil {
                return nil, err
        }

        return NewImage(c.client, checkpoint), nil
}

// loadTask looks up the container's task in the task service and wraps it in
// a client-side task. A not-found reply is reported as "no running task
// found" wrapping the converted error. When ioAttach is non-nil and the
// process status is not unknown, the task's existing IO is attached through
// it; otherwise the returned task has no IO.
func (c *container) loadTask(ctx context.Context, ioAttach cio.Attach) (Task, error) {
        request := &tasks.GetRequest{ContainerID: c.id}
        response, err := c.client.TaskService().Get(ctx, request)
        if err != nil {
                if err = errdefs.FromGRPC(err); errdefs.IsNotFound(err) {
                        return nil, fmt.Errorf("no running task found: %w", err)
                }
                return nil, err
        }

        var attached cio.IO
        // Do not attach IO for task in unknown state, because there
        // are no fifo paths anyway.
        if ioAttach != nil && response.Process.Status != tasktypes.Status_UNKNOWN {
                attached, err = attachExistingIO(response, ioAttach)
                if err != nil {
                        return nil, err
                }
        }

        return &task{
                client: c.client,
                io:     attached,
                id:     response.Process.ID,
                pid:    response.Process.Pid,
                c:      c,
        }, nil
}

// get fetches the container's current record from the container service by
// id. It does not update the cached metadata on the handle.
func (c *container) get(ctx context.Context) (containers.Container, error) {
        return c.client.ContainerService().Get(ctx, c.id)
}

// attachExistingIO builds a FIFO set from the fifo paths in the task
// information stored by the daemon and hands it to ioAttach.
func attachExistingIO(response *tasks.GetResponse, ioAttach cio.Attach) (cio.IO, error) {
        return ioAttach(loadFifos(response))
}

// loadFifos builds a FIFO set from the stdin, stdout and stderr paths of the
// task's process. The set's close function removes each of those paths that
// is a fifo, then tries to remove their parent directories, and returns the
// first removal error encountered for a fifo.
func loadFifos(response *tasks.GetResponse) *cio.FIFOSet {
        proc := response.Process
        paths := []string{proc.Stdin, proc.Stdout, proc.Stderr}

        cleanup := func() error {
                var firstErr error
                parents := make(map[string]struct{})
                for _, p := range paths {
                        isFifo, _ := fifo.IsFifo(p)
                        if !isFifo {
                                continue
                        }
                        rerr := os.Remove(p)
                        if firstErr == nil {
                                firstErr = rerr
                        }
                        parents[filepath.Dir(p)] = struct{}{}
                }
                for dir := range parents {
                        // we ignore errors here because we don't
                        // want to remove the directory if it isn't
                        // empty
                        _ = os.Remove(dir)
                }
                return firstErr
        }

        config := cio.Config{
                Stdin:    proc.Stdin,
                Stdout:   proc.Stdout,
                Stderr:   proc.Stderr,
                Terminal: proc.Terminal,
        }
        return cio.NewFIFOSet(config, cleanup)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "bytes"
        "context"
        "errors"
        "fmt"
        "runtime"

        tasks "github.com/containerd/containerd/api/services/tasks/v1"
        "github.com/containerd/containerd/api/types/runc/options"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/containerd/v2/pkg/protobuf/proto"
        "github.com/containerd/containerd/v2/pkg/rootfs"
        "github.com/containerd/platforms"
        "github.com/opencontainers/go-digest"
        imagespec "github.com/opencontainers/image-spec/specs-go/v1"
)

// ErrMediaTypeNotFound is the error used when a media type in the manifest is unknown
var ErrMediaTypeNotFound = errors.New("media type not found")

// CheckpointOpts are options to manage the checkpoint operation. Each option
// receives the client, the container record, the checkpoint index being
// built, and the checkpoint options, and may modify the latter three.
type CheckpointOpts func(context.Context, *Client, *containers.Container, *imagespec.Index, *options.CheckpointOptions) error

// WithCheckpointImage includes the container image in the checkpoint by
// looking the image up in the image service and appending its target
// descriptor to the index manifests.
func WithCheckpointImage(ctx context.Context, client *Client, c *containers.Container, index *imagespec.Index, copts *options.CheckpointOptions) error {
        img, err := client.ImageService().Get(ctx, c.Image)
        if err == nil {
                index.Manifests = append(index.Manifests, img.Target)
        }
        return err
}

// WithCheckpointTask includes the running task in the checkpoint.
//
// It asks the task service to checkpoint the container's task using copts and
// appends every descriptor the service returns to the index, each tagged with
// the default platform. The marshalled checkpoint options are then written to
// the content store and appended to the index as well.
//
// An error is returned when the options cannot be marshalled, the task
// checkpoint request fails, or the options blob cannot be written.
func WithCheckpointTask(ctx context.Context, client *Client, c *containers.Container, index *imagespec.Index, copts *options.CheckpointOptions) error {
        opt, err := protobuf.MarshalAnyToProto(copts)
        if err != nil {
                // Report the failure: returning nil here would tell the caller the
                // task was checkpointed when nothing was done.
                return err
        }
        task, err := client.TaskService().Checkpoint(ctx, &tasks.CheckpointTaskRequest{
                ContainerID: c.ID,
                Options:     opt,
        })
        if err != nil {
                return err
        }
        for _, d := range task.Descriptors {
                // A fresh spec per descriptor so the manifests do not share a pointer.
                platformSpec := platforms.DefaultSpec()
                index.Manifests = append(index.Manifests, imagespec.Descriptor{
                        MediaType:   d.MediaType,
                        Size:        d.Size,
                        Digest:      digest.Digest(d.Digest),
                        Platform:    &platformSpec,
                        Annotations: d.Annotations,
                })
        }
        // save copts
        data, err := proto.Marshal(opt)
        if err != nil {
                return err
        }
        r := bytes.NewReader(data)
        desc, err := writeContent(ctx, client.ContentStore(), images.MediaTypeContainerd1CheckpointOptions, c.ID+"-checkpoint-options", r)
        if err != nil {
                return err
        }
        desc.Platform = &imagespec.Platform{
                OS:           runtime.GOOS,
                Architecture: runtime.GOARCH,
        }
        index.Manifests = append(index.Manifests, desc)
        return nil
}

// WithCheckpointRuntime includes the container runtime info in the
// checkpoint. When the container record carries runtime options with a
// non-nil value, they are marshalled, written to the content store, and added
// to the index with the local OS and architecture as platform. Otherwise the
// index is left unchanged.
func WithCheckpointRuntime(ctx context.Context, client *Client, c *containers.Container, index *imagespec.Index, copts *options.CheckpointOptions) error {
        if c.Runtime.Options == nil || c.Runtime.Options.GetValue() == nil {
                // no runtime options recorded; nothing to store
                return nil
        }

        data, err := proto.Marshal(protobuf.FromAny(c.Runtime.Options))
        if err != nil {
                return err
        }

        ref := c.ID + "-runtime-options"
        desc, err := writeContent(ctx, client.ContentStore(), images.MediaTypeContainerd1CheckpointRuntimeOptions, ref, bytes.NewReader(data))
        if err != nil {
                return err
        }
        desc.Platform = &imagespec.Platform{
                OS:           runtime.GOOS,
                Architecture: runtime.GOARCH,
        }
        index.Manifests = append(index.Manifests, desc)
        return nil
}

// WithCheckpointRW includes the rw layer in the checkpoint.
//
// A diff of the container's snapshot is created through the client's snapshot
// and diff services, and its descriptor is appended to index.
func WithCheckpointRW(ctx context.Context, client *Client, c *containers.Container, index *imagespec.Index, copts *options.CheckpointOptions) error {
        ref := fmt.Sprintf("checkpoint-rw-%s", c.SnapshotKey)
        rw, err := rootfs.CreateDiff(ctx,
                c.SnapshotKey,
                client.SnapshotService(c.Snapshotter),
                client.DiffService(),
                diff.WithReference(ref),
        )
        if err != nil {
                return err
        }

        rw.Platform = &imagespec.Platform{
                OS:           runtime.GOOS,
                Architecture: runtime.GOARCH,
        }
        index.Manifests = append(index.Manifests, rw)
        return nil
}

// WithCheckpointTaskExit causes the task to exit after checkpoint.
//
// It only sets the Exit flag on copts; ctx, client, c and index are unused.
// The function always returns nil.
func WithCheckpointTaskExit(ctx context.Context, client *Client, c *containers.Container, index *imagespec.Index, copts *options.CheckpointOptions) error {
        copts.Exit = true
        return nil
}

// GetIndexByMediaType returns the first descriptor in the index's manifest
// list whose media type equals mt. The returned pointer refers to a copy of
// the matching entry, not to the element stored in the index.
// ErrMediaTypeNotFound is returned when no entry matches.
func GetIndexByMediaType(index *imagespec.Index, mt string) (*imagespec.Descriptor, error) {
        for i := range index.Manifests {
                if index.Manifests[i].MediaType != mt {
                        continue
                }
                match := index.Manifests[i]
                return &match, nil
        }
        return nil, ErrMediaTypeNotFound
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "encoding/json"
        "errors"
        "fmt"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
        "github.com/containerd/typeurl/v2"
        "github.com/opencontainers/image-spec/identity"
        v1 "github.com/opencontainers/image-spec/specs-go/v1"
)

// DeleteOpts allows the caller to set options for the deletion of a container.
// The container is passed by value.
type DeleteOpts func(ctx context.Context, client *Client, c containers.Container) error

// NewContainerOpts allows the caller to set additional options when creating a container.
// The container is passed by pointer so an option can modify it in place.
type NewContainerOpts func(ctx context.Context, client *Client, c *containers.Container) error

// UpdateContainerOpts allows the caller to set additional options when updating a container.
// The container is passed by pointer so an option can modify it in place.
type UpdateContainerOpts func(ctx context.Context, client *Client, c *containers.Container) error

// InfoOpts controls how container metadata is fetched and returned.
// Each option mutates the InfoConfig it is given.
type InfoOpts func(*InfoConfig)

// InfoConfig specifies how container metadata is fetched.
type InfoConfig struct {
        // Refresh will do a fetch of the latest container metadata
        Refresh bool
}

// WithRuntime sets the runtime name, and optionally the runtime options, that
// should be used to create tasks for the container.
//
// When options is non-nil it is marshalled with typeurl.MarshalAny; a
// marshalling error is returned and leaves the container's runtime untouched.
func WithRuntime(name string, options interface{}) NewContainerOpts {
        return func(_ context.Context, _ *Client, c *containers.Container) error {
                info := containers.RuntimeInfo{Name: name}
                if options != nil {
                        marshalled, err := typeurl.MarshalAny(options)
                        if err != nil {
                                return err
                        }
                        info.Options = marshalled
                }
                c.Runtime = info
                return nil
        }
}

// WithSandbox joins the container to the container group (aka sandbox)
// identified by sandboxID.
// Note: the shim runtime must support sandbox environments.
func WithSandbox(sandboxID string) NewContainerOpts {
        return func(_ context.Context, _ *Client, ctr *containers.Container) error {
                ctr.SandboxID = sandboxID
                return nil
        }
}

// WithImage sets the provided image as the base for the container by
// recording the image's name on the container.
func WithImage(i Image) NewContainerOpts {
        return func(_ context.Context, _ *Client, ctr *containers.Container) error {
                ctr.Image = i.Name()
                return nil
        }
}

// WithImageName sets the image name n as the base for the container.
func WithImageName(n string) NewContainerOpts {
        return func(_ context.Context, _ *Client, ctr *containers.Container) error {
                ctr.Image = n
                return nil
        }
}

// WithContainerLabels replaces the container's labels with the provided map.
// Any labels already present on the container are dropped.
// Use WithAdditionalContainerLabels to preserve the existing labels.
func WithContainerLabels(labels map[string]string) NewContainerOpts {
        return func(_ context.Context, _ *Client, ctr *containers.Container) error {
                ctr.Labels = labels
                return nil
        }
}

// WithImageConfigLabels sets the image config labels on the container.
// The existing labels are replaced, as this is expected to be the first
// operation in setting up a container's labels. Use
// WithAdditionalContainerLabels to add to or overwrite the image config labels.
//
// An error is returned when the image config cannot be resolved, has an
// unknown media type, cannot be read, or is not valid JSON.
func WithImageConfigLabels(image Image) NewContainerOpts {
        return func(ctx context.Context, _ *Client, c *containers.Container) error {
                desc, err := image.Config(ctx)
                if err != nil {
                        return err
                }
                if !images.IsConfigType(desc.MediaType) {
                        return fmt.Errorf("unknown image config media type %s", desc.MediaType)
                }

                blob, err := content.ReadBlob(ctx, image.ContentStore(), desc)
                if err != nil {
                        return err
                }

                var parsed v1.Image
                if err := json.Unmarshal(blob, &parsed); err != nil {
                        return err
                }

                c.Labels = parsed.Config.Labels
                return nil
        }
}

// WithAdditionalContainerLabels adds the provided labels to the container.
// Existing labels are kept unless a provided label uses the same key, in
// which case the provided value wins. When the container has no label map
// yet, the provided map itself becomes the container's label map.
func WithAdditionalContainerLabels(labels map[string]string) NewContainerOpts {
        return func(_ context.Context, _ *Client, c *containers.Container) error {
                if c.Labels != nil {
                        for key, value := range labels {
                                c.Labels[key] = value
                        }
                        return nil
                }
                c.Labels = labels
                return nil
        }
}

// WithImageStopSignal stores the stop signal obtained from GetOCIStopSignal
// for the given image and default under the well-known containerd label
// StopSignalLabel on the container.
func WithImageStopSignal(image Image, defaultSignal string) NewContainerOpts {
        return func(ctx context.Context, _ *Client, ctr *containers.Container) error {
                // The label map is created up front, before the signal lookup.
                if ctr.Labels == nil {
                        ctr.Labels = make(map[string]string)
                }
                sig, err := GetOCIStopSignal(ctx, image, defaultSignal)
                if err != nil {
                        return err
                }
                ctr.Labels[StopSignalLabel] = sig
                return nil
        }
}

// WithSnapshotter sets the name of the snapshotter used by the container.
//
// This option must appear before other snapshotter options to have an effect.
func WithSnapshotter(name string) NewContainerOpts {
        return func(_ context.Context, _ *Client, ctr *containers.Container) error {
                ctr.Snapshotter = name
                return nil
        }
}

// WithSnapshot uses an existing root filesystem snapshot, identified by id,
// for the container. Creation fails if the snapshot's mounts cannot be
// obtained from the resolved snapshotter.
func WithSnapshot(id string) NewContainerOpts {
        return func(ctx context.Context, client *Client, c *containers.Container) error {
                resolved, err := client.resolveSnapshotterName(ctx, c.Snapshotter)
                // The container field is updated before the error is inspected.
                c.Snapshotter = resolved
                if err != nil {
                        return err
                }
                sn, err := client.getSnapshotter(ctx, resolved)
                if err != nil {
                        return err
                }
                // Requesting the mounts doubles as an existence check for the snapshot.
                if _, err = sn.Mounts(ctx, id); err != nil {
                        return err
                }
                c.SnapshotKey = id
                return nil
        }
}

// WithSnapshotCleanup deletes the rootfs snapshot allocated for the container.
//
// Containers without a snapshot key are left alone. A container that has a
// snapshot key but no snapshotter yields an invalid-argument error. A
// not-found error from the snapshotter's Remove is treated as success.
func WithSnapshotCleanup(ctx context.Context, client *Client, c containers.Container) error {
        if c.SnapshotKey == "" {
                return nil
        }
        if c.Snapshotter == "" {
                return fmt.Errorf("container.Snapshotter must be set to cleanup rootfs snapshot: %w", errdefs.ErrInvalidArgument)
        }

        sn, err := client.getSnapshotter(ctx, c.Snapshotter)
        if err != nil {
                return err
        }
        err = sn.Remove(ctx, c.SnapshotKey)
        if err == nil || errdefs.IsNotFound(err) {
                return nil
        }
        return err
}

// WithNewSnapshot allocates a new read-write snapshot, keyed by id and based
// on image i, to be used as the container's root filesystem.
func WithNewSnapshot(id string, i Image, opts ...snapshots.Opt) NewContainerOpts {
        const readonly = false
        return withNewSnapshot(id, i, readonly, opts...)
}

// WithNewSnapshotView allocates a new read-only snapshot, keyed by id and
// based on image i, to be used as the container's root filesystem.
func WithNewSnapshotView(id string, i Image, opts ...snapshots.Opt) NewContainerOpts {
        const readonly = true
        return withNewSnapshot(id, i, readonly, opts...)
}

// withNewSnapshot returns an option that creates snapshot id on top of the
// chain ID of image i's rootfs layers and records the snapshot key and image
// name on the container. With readonly set the snapshot is created through
// the snapshotter's View, otherwise through Prepare.
func withNewSnapshot(id string, i Image, readonly bool, opts ...snapshots.Opt) NewContainerOpts {
        return func(ctx context.Context, client *Client, c *containers.Container) error {
                diffIDs, err := i.RootFS(ctx)
                if err != nil {
                        return err
                }
                chainID := identity.ChainID(diffIDs).String()

                c.Snapshotter, err = client.resolveSnapshotterName(ctx, c.Snapshotter)
                if err != nil {
                        return err
                }
                sn, err := client.getSnapshotter(ctx, c.Snapshotter)
                if err != nil {
                        return err
                }
                parent, err := resolveSnapshotOptions(ctx, client, c.Snapshotter, sn, chainID, opts...)
                if err != nil {
                        return err
                }

                // Pick the creation method once, then invoke it.
                create := sn.Prepare
                if readonly {
                        create = sn.View
                }
                if _, err := create(ctx, id, parent, opts...); err != nil {
                        return err
                }

                c.SnapshotKey = id
                c.Image = i.Name()
                return nil
        }
}

// WithContainerExtension stores extension data on the container object under
// the given name. Use this to decorate the container object with additional
// data for the client integration.
//
// An empty name is rejected with an invalid-argument error. Make sure to
// register the type of `extension` in the typeurl package via
// `typeurl.Register` or container creation may fail.
func WithContainerExtension(name string, extension interface{}) NewContainerOpts {
        return func(_ context.Context, _ *Client, c *containers.Container) error {
                if name == "" {
                        return fmt.Errorf("extension key must not be zero-length: %w", errdefs.ErrInvalidArgument)
                }

                ext, err := typeurl.MarshalAny(extension)
                switch {
                case err == nil:
                        // marshalled successfully; fall through to storing it
                case errors.Is(err, typeurl.ErrNotFound):
                        return fmt.Errorf("extension %q is not registered with the typeurl package, see `typeurl.Register`: %w", name, err)
                default:
                        return fmt.Errorf("error marshalling extension: %w", err)
                }

                if c.Extensions == nil {
                        c.Extensions = make(map[string]typeurl.Any)
                }
                c.Extensions[name] = ext
                return nil
        }
}

// WithNewSpec generates a new spec for a new container from the given spec
// options and stores it, marshalled, on the container. If the context carries
// no namespace, the client's default namespace is used for generation.
func WithNewSpec(opts ...oci.SpecOpts) NewContainerOpts {
        return func(ctx context.Context, client *Client, c *containers.Container) error {
                if _, hasNS := namespaces.Namespace(ctx); !hasNS {
                        ctx = namespaces.WithNamespace(ctx, client.DefaultNamespace())
                }
                spec, err := oci.GenerateSpec(ctx, client, c, opts...)
                if err != nil {
                        return err
                }
                marshalled, err := typeurl.MarshalAny(spec)
                c.Spec = marshalled
                return err
        }
}

// WithSpec applies the given spec options to s and then stores the
// marshalled spec on the container.
func WithSpec(s *oci.Spec, opts ...oci.SpecOpts) NewContainerOpts {
        return func(ctx context.Context, client *Client, c *containers.Container) error {
                err := oci.ApplyOpts(ctx, client, c, s, opts...)
                if err != nil {
                        return err
                }

                c.Spec, err = protobuf.MarshalAnyToProto(s)
                return err
        }
}

// WithoutRefreshedMetadata will use the current metadata attached to the container object.
// It does so by clearing the Refresh flag on the given InfoConfig.
func WithoutRefreshedMetadata(i *InfoConfig) {
        i.Refresh = false
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "fmt"
        "os"
        "path/filepath"
        "syscall"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/errdefs"
        "github.com/opencontainers/image-spec/identity"
)

// WithRemappedSnapshot creates a new read-write snapshot and remaps the
// uid/gid of the filesystem so it can be used by a container with user
// namespaces.
func WithRemappedSnapshot(id string, i Image, uid, gid uint32) NewContainerOpts {
        const readonly = false
        return withRemappedSnapshotBase(id, i, uid, gid, readonly)
}

// WithRemappedSnapshotView is similar to WithRemappedSnapshot but the rootfs
// is mounted as read-only.
func WithRemappedSnapshotView(id string, i Image, uid, gid uint32) NewContainerOpts {
        const readonly = true
        return withRemappedSnapshotBase(id, i, uid, gid, readonly)
}

// withRemappedSnapshotBase returns an option that gives the container a
// snapshot, keyed by id, whose parent is a uid/gid-remapped copy of image i's
// rootfs.
//
// The remapped parent is committed under "<chainID>-<uid>-<gid>". If that
// snapshot already exists it is reused; otherwise it is built by preparing a
// temporary "<chainID>-<uid>-<gid>-remap" snapshot, shifting ownership of every
// file in it by uid/gid, and committing it. The container's own snapshot is
// created with View when readonly is set and with Prepare otherwise, both when
// the parent is reused and when it is freshly built.
//
// NOTE(review): i must be the client's concrete *image type; any other Image
// implementation makes the type assertion below panic.
func withRemappedSnapshotBase(id string, i Image, uid, gid uint32, readonly bool) NewContainerOpts {
        return func(ctx context.Context, client *Client, c *containers.Container) error {
                diffIDs, err := i.(*image).i.RootFS(ctx, client.ContentStore(), client.platform)
                if err != nil {
                        return err
                }

                var (
                        parent   = identity.ChainID(diffIDs).String()
                        usernsID = fmt.Sprintf("%s-%d-%d", parent, uid, gid)
                        remapKey = usernsID + "-remap"
                )
                c.Snapshotter, err = client.resolveSnapshotterName(ctx, c.Snapshotter)
                if err != nil {
                        return err
                }
                snapshotter, err := client.getSnapshotter(ctx, c.Snapshotter)
                if err != nil {
                        return err
                }

                // createRootfs makes the container's snapshot on top of the
                // remapped parent, honoring the requested access mode.
                createRootfs := func() error {
                        var err error
                        if readonly {
                                _, err = snapshotter.View(ctx, id, usernsID)
                        } else {
                                _, err = snapshotter.Prepare(ctx, id, usernsID)
                        }
                        return err
                }

                // Fast path: the remapped parent already exists. A not-found error
                // from the creation attempt means the parent vanished in between,
                // so fall through and rebuild it.
                if _, err := snapshotter.Stat(ctx, usernsID); err == nil {
                        if err := createRootfs(); err == nil {
                                c.SnapshotKey = id
                                c.Image = i.Name()
                                return nil
                        } else if !errdefs.IsNotFound(err) {
                                return err
                        }
                }

                mounts, err := snapshotter.Prepare(ctx, remapKey, parent)
                if err != nil {
                        return err
                }
                if err := remapRootFS(ctx, mounts, uid, gid); err != nil {
                        // Best-effort cleanup of the temporary snapshot prepared above;
                        // the remap error is the one worth reporting.
                        _ = snapshotter.Remove(ctx, remapKey)
                        return err
                }
                if err := snapshotter.Commit(ctx, usernsID, remapKey); err != nil {
                        return err
                }
                if err := createRootfs(); err != nil {
                        return err
                }
                c.SnapshotKey = id
                c.Image = i.Name()
                return nil
        }
}

// remapRootFS temporarily mounts the given mounts and walks the mounted tree,
// applying incrementFS with uid and gid to every entry.
func remapRootFS(ctx context.Context, mounts []mount.Mount, uid, gid uint32) error {
        shiftOwnership := func(root string) error {
                return filepath.Walk(root, incrementFS(root, uid, gid))
        }
        return mount.WithTempMount(ctx, mounts, shiftOwnership)
}

// incrementFS returns a filepath.WalkFunc that changes the owner of each
// visited path to its current uid plus uidInc and its current gid plus gidInc.
// An error passed in by the walker is returned unchanged. The root argument
// is not used.
func incrementFS(root string, uidInc, gidInc uint32) filepath.WalkFunc {
        return func(path string, info os.FileInfo, walkErr error) error {
                if walkErr != nil {
                        return walkErr
                }
                st := info.Sys().(*syscall.Stat_t)
                newUID := int(st.Uid + uidInc)
                newGID := int(st.Gid + gidInc)
                // Use lchown so a symlink is never dereferenced to a host file.
                return os.Lchown(path, newUID, newGID)
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "errors"
        "fmt"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/protobuf/proto"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/opencontainers/image-spec/identity"
        imagespec "github.com/opencontainers/image-spec/specs-go/v1"
)

// Sentinel errors describing a required annotation that is absent from a
// checkpoint index.
var (
        // ErrImageNameNotFoundInIndex is returned when the image name is not found in the index
        ErrImageNameNotFoundInIndex = errors.New("image name not found in index")
        // ErrRuntimeNameNotFoundInIndex is returned when the runtime is not found in the index
        ErrRuntimeNameNotFoundInIndex = errors.New("runtime not found in index")
        // ErrSnapshotterNameNotFoundInIndex is returned when the snapshotter is not found in the index
        ErrSnapshotterNameNotFoundInIndex = errors.New("snapshotter not found in index")
)

// RestoreOpts are options to manage the restore operation.
// Each option receives a context, an id string, the client, the checkpoint
// image and its index, and yields a NewContainerOpts.
type RestoreOpts func(context.Context, string, *Client, Image, *imagespec.Index) NewContainerOpts

// WithRestoreImage restores the image for the container.
//
// The image name and the snapshotter name are read from the index
// annotations. A missing or empty image name yields
// ErrImageNameNotFoundInIndex; a missing or empty snapshotter name yields
// ErrSnapshotterNameNotFoundInIndex. A new snapshot keyed by id is then
// prepared on top of the image's rootfs chain ID, and the image name, snapshot
// key and snapshotter are recorded on the container.
//
// NOTE(review): the image returned by client.GetImage is assumed to be the
// client's concrete *image type; the type assertion panics otherwise.
func WithRestoreImage(ctx context.Context, id string, client *Client, checkpoint Image, index *imagespec.Index) NewContainerOpts {
        return func(ctx context.Context, client *Client, c *containers.Container) error {
                name, ok := index.Annotations[imagespec.AnnotationRefName]
                if !ok || name == "" {
                        return ErrImageNameNotFoundInIndex
                }
                snapshotter, ok := index.Annotations[checkpointSnapshotterNameLabel]
                // Validate the snapshotter value itself, not the image name again.
                if !ok || snapshotter == "" {
                        return ErrSnapshotterNameNotFoundInIndex
                }
                i, err := client.GetImage(ctx, name)
                if err != nil {
                        return err
                }

                diffIDs, err := i.(*image).i.RootFS(ctx, client.ContentStore(), client.platform)
                if err != nil {
                        return err
                }
                parent := identity.ChainID(diffIDs).String()
                if _, err := client.SnapshotService(snapshotter).Prepare(ctx, id, parent); err != nil {
                        return err
                }
                c.Image = i.Name()
                c.SnapshotKey = id
                c.Snapshotter = snapshotter
                return nil
        }
}

// WithRestoreRuntime restores the runtime for the container.
//
// The runtime name is read from the index annotations; if the annotation is
// absent ErrRuntimeNameNotFoundInIndex is returned. Runtime options are
// optional: when the index holds a runtime-options entry, its blob is read
// from the content store and unmarshalled; when it does not, the container
// receives an empty options value.
func WithRestoreRuntime(ctx context.Context, id string, client *Client, checkpoint Image, index *imagespec.Index) NewContainerOpts {
        return func(ctx context.Context, client *Client, c *containers.Container) error {
                name, ok := index.Annotations[checkpointRuntimeNameLabel]
                if !ok {
                        return ErrRuntimeNameNotFoundInIndex
                }

                // restore options if present; a missing entry is not an error
                m, err := GetIndexByMediaType(index, images.MediaTypeContainerd1CheckpointRuntimeOptions)
                if err != nil && !errors.Is(err, ErrMediaTypeNotFound) {
                        return err
                }
                var options ptypes.Any
                if m != nil {
                        store := client.ContentStore()
                        data, err := content.ReadBlob(ctx, store, *m)
                        if err != nil {
                                return fmt.Errorf("unable to read checkpoint runtime: %w", err)
                        }
                        if err := proto.Unmarshal(data, &options); err != nil {
                                return err
                        }
                }

                c.Runtime = containers.RuntimeInfo{
                        Name:    name,
                        Options: &options,
                }
                return nil
        }
}

// WithRestoreSpec restores the spec from the checkpoint for the container.
//
// The checkpoint config entry is looked up in the index, its blob is read
// from the client's content store and unmarshalled, and the result is set as
// the container's spec.
func WithRestoreSpec(ctx context.Context, id string, client *Client, checkpoint Image, index *imagespec.Index) NewContainerOpts {
        return func(ctx context.Context, client *Client, c *containers.Container) error {
                desc, err := GetIndexByMediaType(index, images.MediaTypeContainerd1CheckpointConfig)
                if err != nil {
                        return err
                }
                blob, err := content.ReadBlob(ctx, client.ContentStore(), *desc)
                if err != nil {
                        return fmt.Errorf("unable to read checkpoint config: %w", err)
                }
                spec := new(ptypes.Any)
                if err := proto.Unmarshal(blob, spec); err != nil {
                        return err
                }
                c.Spec = spec
                return nil
        }
}

// WithRestoreRW restores the rw layer from the checkpoint for the container.
//
// The gzip layer entry of the index is applied, through the client's diff
// service, onto the mounts of the container's snapshot.
func WithRestoreRW(ctx context.Context, id string, client *Client, checkpoint Image, index *imagespec.Index) NewContainerOpts {
        return func(ctx context.Context, client *Client, c *containers.Container) error {
                layer, err := GetIndexByMediaType(index, imagespec.MediaTypeImageLayerGzip)
                if err != nil {
                        return err
                }
                mounts, err := client.SnapshotService(c.Snapshotter).Mounts(ctx, c.SnapshotKey)
                if err != nil {
                        return err
                }

                _, err = client.DiffService().Apply(ctx, *layer, mounts)
                return err
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "errors"
        "io"

        containersapi "github.com/containerd/containerd/api/services/containers/v1"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/errdefs"
        "github.com/containerd/typeurl/v2"
        "google.golang.org/grpc/codes"
        "google.golang.org/grpc/status"
)

// remoteContainers is a containers.Store backed by a containers API client.
type remoteContainers struct {
        // client is the API client every store operation is forwarded to.
        client containersapi.ContainersClient
}

// Compile-time check that *remoteContainers satisfies containers.Store.
var _ containers.Store = &remoteContainers{}

// NewRemoteContainerStore returns a container Store that forwards its
// operations to the provided containers API client.
func NewRemoteContainerStore(client containersapi.ContainersClient) containers.Store {
        return &remoteContainers{client: client}
}

// Get fetches the container with the given id through the API client. Errors
// from the call are translated with errdefs.FromGRPC.
func (r *remoteContainers) Get(ctx context.Context, id string) (containers.Container, error) {
        req := &containersapi.GetContainerRequest{ID: id}
        resp, err := r.client.Get(ctx, req)
        if err != nil {
                return containers.Container{}, errdefs.FromGRPC(err)
        }
        return containerFromProto(resp.Container), nil
}

// List returns the containers matching the given filters. The streaming
// listing is tried first; if it reports errStreamNotAvailable the plain
// listing call is used instead.
func (r *remoteContainers) List(ctx context.Context, filters ...string) ([]containers.Container, error) {
        streamed, err := r.stream(ctx, filters...)
        switch {
        case err == nil:
                return streamed, nil
        case err == errStreamNotAvailable:
                return r.list(ctx, filters...)
        default:
                return nil, err
        }
}

// list fetches all containers matching the filters in a single, non-streaming
// API call. Errors from the call are translated with errdefs.FromGRPC.
func (r *remoteContainers) list(ctx context.Context, filters ...string) ([]containers.Container, error) {
        req := &containersapi.ListContainersRequest{Filters: filters}
        resp, err := r.client.List(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        return containersFromProto(resp.Containers), nil
}

// errStreamNotAvailable is returned by stream when the server answers the
// streaming list call with the Unimplemented status code.
var errStreamNotAvailable = errors.New("streaming api not available")

// stream lists containers through the streaming API, collecting messages
// until the stream ends with io.EOF.
//
// errStreamNotAvailable is returned when receiving fails with the
// Unimplemented status code. If the context is done after a message was
// received, the containers collected so far are returned together with the
// context's error.
func (r *remoteContainers) stream(ctx context.Context, filters ...string) ([]containers.Container, error) {
        req := &containersapi.ListContainersRequest{Filters: filters}
        session, err := r.client.ListStream(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        var received []containers.Container
        for {
                msg, err := session.Recv()
                if err == io.EOF {
                        return received, nil
                }
                if err != nil {
                        if st, ok := status.FromError(err); ok && st.Code() == codes.Unimplemented {
                                return nil, errStreamNotAvailable
                        }
                        return nil, errdefs.FromGRPC(err)
                }

                // Stop early, without recording this message, once the context is done.
                select {
                case <-ctx.Done():
                        return received, ctx.Err()
                default:
                }
                received = append(received, containerFromProto(msg.Container))
        }
}

// Create sends the container to the API client's Create call and returns the
// container from the response. Errors are translated with errdefs.FromGRPC.
func (r *remoteContainers) Create(ctx context.Context, container containers.Container) (containers.Container, error) {
        req := &containersapi.CreateContainerRequest{
                Container: containerToProto(&container),
        }
        resp, err := r.client.Create(ctx, req)
        if err != nil {
                return containers.Container{}, errdefs.FromGRPC(err)
        }
        return containerFromProto(resp.Container), nil
}

// Update sends the container to the API client's Update call and returns the
// container from the response. When fieldpaths are given they are sent as the
// update mask; otherwise no mask is set. Errors are translated with
// errdefs.FromGRPC.
func (r *remoteContainers) Update(ctx context.Context, container containers.Container, fieldpaths ...string) (containers.Container, error) {
        req := &containersapi.UpdateContainerRequest{
                Container: containerToProto(&container),
        }
        if len(fieldpaths) > 0 {
                req.UpdateMask = &ptypes.FieldMask{Paths: fieldpaths}
        }

        resp, err := r.client.Update(ctx, req)
        if err != nil {
                return containers.Container{}, errdefs.FromGRPC(err)
        }
        return containerFromProto(resp.Container), nil
}

// Delete removes the container with the given id through the API client. The
// call's error is translated with errdefs.FromGRPC.
func (r *remoteContainers) Delete(ctx context.Context, id string) error {
        req := &containersapi.DeleteContainerRequest{ID: id}
        _, err := r.client.Delete(ctx, req)
        return errdefs.FromGRPC(err)
}

// containerToProto converts a containers.Container into its API message form.
//
// Every extension, the runtime options and the spec are converted with
// protobuf.FromAny. The Runtime message is always populated, and Extensions is
// always a non-nil map, even when the container has no extensions. CreatedAt
// and UpdatedAt are not copied into the message.
func containerToProto(container *containers.Container) *containersapi.Container {
        // The final size is known up front, so size the map once.
        extensions := make(map[string]*ptypes.Any, len(container.Extensions))
        for k, v := range container.Extensions {
                extensions[k] = protobuf.FromAny(v)
        }
        return &containersapi.Container{
                ID:     container.ID,
                Labels: container.Labels,
                Image:  container.Image,
                Runtime: &containersapi.Container_Runtime{
                        Name:    container.Runtime.Name,
                        Options: protobuf.FromAny(container.Runtime.Options),
                },
                Spec:        protobuf.FromAny(container.Spec),
                Snapshotter: container.Snapshotter,
                SnapshotKey: container.SnapshotKey,
                Extensions:  extensions,
                Sandbox:     container.SandboxID,
        }
}

// containerFromProto converts an API container message into a
// containers.Container.
//
// The runtime info is left at its zero value when the message carries no
// Runtime. Extensions is always a non-nil map. containerpb must be non-nil.
func containerFromProto(containerpb *containersapi.Container) containers.Container {
        var runtime containers.RuntimeInfo
        if containerpb.Runtime != nil {
                runtime = containers.RuntimeInfo{
                        Name:    containerpb.Runtime.Name,
                        Options: containerpb.Runtime.Options,
                }
        }
        // The value is stored directly in the map and never captured by a
        // closure, so no per-iteration copy of the loop variable is needed.
        extensions := make(map[string]typeurl.Any, len(containerpb.Extensions))
        for k, v := range containerpb.Extensions {
                extensions[k] = v
        }
        return containers.Container{
                ID:          containerpb.ID,
                Labels:      containerpb.Labels,
                Image:       containerpb.Image,
                Runtime:     runtime,
                Spec:        containerpb.Spec,
                Snapshotter: containerpb.Snapshotter,
                SnapshotKey: containerpb.SnapshotKey,
                CreatedAt:   protobuf.FromTimestamp(containerpb.CreatedAt),
                UpdatedAt:   protobuf.FromTimestamp(containerpb.UpdatedAt),
                Extensions:  extensions,
                SandboxID:   containerpb.Sandbox,
        }
}

// containersFromProto converts a list of API container messages into
// containers.Container values, preserving order. It returns nil when the
// input is empty.
func containersFromProto(containerspb []*containersapi.Container) []containers.Container {
        // Keep the nil result for empty input.
        if len(containerspb) == 0 {
                return nil
        }

        // The result is named so that it does not shadow the containers package.
        converted := make([]containers.Container, 0, len(containerspb))
        for _, pb := range containerspb {
                converted = append(converted, containerFromProto(pb))
        }
        return converted
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        diffapi "github.com/containerd/containerd/api/services/diff/v1"
        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/diff/proxy"
)

// DiffService handles the computation and application of diffs. It is the
// union of the diff.Comparer and diff.Applier interfaces.
type DiffService interface {
        diff.Comparer
        diff.Applier
}

// NewDiffServiceFromClient returns a new diff service which communicates
// over a GRPC connection.
//
// The value built by proxy.NewDiffApplier is asserted to DiffService with an
// unchecked type assertion, which panics if the value does not implement the
// interface.
func NewDiffServiceFromClient(client diffapi.DiffClient) DiffService {
        applier := proxy.NewDiffApplier(client)
        return applier.(DiffService)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"

        eventsapi "github.com/containerd/containerd/api/services/events/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
        "github.com/containerd/typeurl/v2"
)

// EventService handles the publish, forward and subscribe of events. It is
// the union of the events.Publisher, events.Forwarder and events.Subscriber
// interfaces.
type EventService interface {
        events.Publisher
        events.Forwarder
        events.Subscriber
}

// NewEventServiceFromClient returns a new event service which communicates
// over a GRPC connection using the given events API client.
func NewEventServiceFromClient(client eventsapi.EventsClient) EventService {
        remote := &eventRemote{client: client}
        return remote
}

// eventRemote implements EventService on top of an events API client.
type eventRemote struct {
        // client is the events API client every call is sent through.
        client eventsapi.EventsClient
}

// Publish marshals event with typeurl.MarshalAny and publishes it under topic
// through the remote events service. A marshalling error is returned as is;
// an RPC error is converted with errdefs.FromGRPC.
func (e *eventRemote) Publish(ctx context.Context, topic string, event events.Event) error {
        payload, err := typeurl.MarshalAny(event)
        if err != nil {
                return err
        }

        _, err = e.client.Publish(ctx, &eventsapi.PublishRequest{
                Topic: topic,
                Event: protobuf.FromAny(payload),
        })
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Forward sends an existing envelope to the remote events service, copying
// its timestamp, namespace, topic and event into the API envelope. An RPC
// error is converted with errdefs.FromGRPC. envelope must be non-nil.
func (e *eventRemote) Forward(ctx context.Context, envelope *events.Envelope) error {
        wire := &types.Envelope{
                Timestamp: protobuf.ToTimestamp(envelope.Timestamp),
                Namespace: envelope.Namespace,
                Topic:     envelope.Topic,
                Event:     protobuf.FromAny(envelope.Event),
        }

        _, err := e.client.Forward(ctx, &eventsapi.ForwardRequest{Envelope: wire})
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Subscribe opens an event subscription on the remote events service with the
// given filters. It returns a channel of received envelopes and a channel of
// errors.
//
// If the subscription cannot be opened, the error is sent on the error
// channel, the error channel is closed, and no goroutine is started.
// Otherwise a goroutine forwards each received event to the envelope channel
// until session.Recv fails or ctx is done. The error channel is closed when
// that goroutine exits. The envelope channel is never closed by this method.
func (e *eventRemote) Subscribe(ctx context.Context, filters ...string) (ch <-chan *events.Envelope, errs <-chan error) {
        // evq is unbuffered. errq has room for one error, and every path below
        // sends at most one, so sending never blocks on a missing receiver.
        var (
                evq  = make(chan *events.Envelope)
                errq = make(chan error, 1)
        )

        // Expose the channels through the named results as receive-only.
        errs = errq
        ch = evq

        session, err := e.client.Subscribe(ctx, &eventsapi.SubscribeRequest{
                Filters: filters,
        })
        if err != nil {
                // The error is forwarded unchanged (no errdefs conversion).
                errq <- err
                close(errq)
                return
        }

        go func() {
                defer close(errq)

                for {
                        ev, err := session.Recv()
                        if err != nil {
                                // Any receive failure ends the subscription.
                                errq <- err
                                return
                        }

                        // Deliver the event unless the context ends first.
                        select {
                        case evq <- &events.Envelope{
                                Timestamp: protobuf.FromTimestamp(ev.Timestamp),
                                Namespace: ev.Namespace,
                                Topic:     ev.Topic,
                                Event:     ev.Event,
                        }:
                        case <-ctx.Done():
                                // Plain cancellation is not reported as an error;
                                // any other context error is.
                                if cerr := ctx.Err(); cerr != context.Canceled {
                                        errq <- cerr
                                }
                                return
                        }
                }
        }()

        return ch, errs
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "io"

        "github.com/containerd/containerd/v2/core/images/archive"
)

// Export exports images to a Tar stream.
// The tar archive is in OCI format with a Docker compatible manifest
// when a single target platform is given.
//
// The work is delegated to archive.Export, reading from the client's content
// store and writing to w.
func (c *Client) Export(ctx context.Context, w io.Writer, opts ...archive.ExportOpt) error {
        store := c.ContentStore()
        return archive.Export(ctx, store, w, opts...)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"

        "github.com/containerd/containerd/v2/pkg/namespaces"
        "google.golang.org/grpc"
)

// namespaceInterceptor provides gRPC client interceptors that apply a
// fallback namespace to outgoing calls whose context does not carry one.
type namespaceInterceptor struct {
        // namespace is applied when the call context has no namespace.
        namespace string
}

// unary intercepts a unary call. When namespaces.Namespace reports no
// namespace on ctx, the interceptor's namespace is added before the call is
// passed to invoker; a namespace already on ctx is left untouched.
func (ni namespaceInterceptor) unary(ctx context.Context, method string, req, reply interface{}, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
        if _, ok := namespaces.Namespace(ctx); !ok {
                ctx = namespaces.WithNamespace(ctx, ni.namespace)
        }
        return invoker(ctx, method, req, reply, cc, opts...)
}

// stream intercepts the creation of a client stream. When
// namespaces.Namespace reports no namespace on ctx, the interceptor's
// namespace is added before the call is passed to streamer; a namespace
// already on ctx is left untouched.
func (ni namespaceInterceptor) stream(ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn, method string, streamer grpc.Streamer, opts ...grpc.CallOption) (grpc.ClientStream, error) {
        if _, ok := namespaces.Namespace(ctx); !ok {
                ctx = namespaces.WithNamespace(ctx, ni.namespace)
        }
        return streamer(ctx, desc, cc, method, opts...)
}

// newNSInterceptors returns the unary and stream client interceptors of a
// namespaceInterceptor configured with ns as its fallback namespace.
func newNSInterceptors(ns string) (grpc.UnaryClientInterceptor, grpc.StreamClientInterceptor) {
        ni := namespaceInterceptor{namespace: ns}

        var (
                unary  grpc.UnaryClientInterceptor  = ni.unary
                stream grpc.StreamClientInterceptor = ni.stream
        )
        return unary, stream
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "encoding/json"
        "errors"
        "fmt"
        "sync"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/images/usage"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/internal/kmutex"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/containerd/v2/pkg/rootfs"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
        "github.com/opencontainers/go-digest"
        "github.com/opencontainers/image-spec/identity"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// Image describes an image used by containers.
type Image interface {
        // Name returns the name of the image.
        Name() string
        // Target returns the target descriptor for the image content.
        Target() ocispec.Descriptor
        // Labels returns the labels of the image.
        Labels() map[string]string
        // Unpack unpacks the image's content into a snapshot using the named
        // snapshotter.
        Unpack(context.Context, string, ...UnpackOpt) error
        // RootFS returns the unpacked diffids that make up the image's rootfs.
        RootFS(ctx context.Context) ([]digest.Digest, error)
        // Size returns the total size of the image's packed resources.
        Size(ctx context.Context) (int64, error)
        // Usage returns a usage calculation for the image.
        Usage(context.Context, ...UsageOpt) (int64, error)
        // Config returns the config descriptor for the image.
        Config(ctx context.Context) (ocispec.Descriptor, error)
        // IsUnpacked returns whether an image is unpacked for the named
        // snapshotter.
        IsUnpacked(context.Context, string) (bool, error)
        // ContentStore provides a content store which contains image blob data.
        ContentStore() content.Store
        // Metadata returns the underlying image metadata.
        Metadata() images.Image
        // Platform returns the platform match comparer. Can be nil.
        Platform() platforms.MatchComparer
        // Spec returns the OCI image spec for a given image.
        Spec(ctx context.Context) (ocispec.Image, error)
}

// usageOptions collects the settings applied by UsageOpt functions and read
// by (*image).Usage.
type usageOptions struct {
        // manifestLimit, when non-nil, is passed to usage.WithManifestLimit.
        manifestLimit *int
        // manifestOnly enables usage.WithManifestUsage.
        manifestOnly  bool
        // snapshots enables usage.WithSnapshotters.
        snapshots     bool
}

// UsageOpt is used to configure the usage calculation. A non-nil error
// returned by an option aborts the Usage call with that error.
type UsageOpt func(*usageOptions) error

// WithUsageManifestLimit sets the limit to the number of manifests which will
// be walked for usage. Setting this value to 0 will require all manifests to
// be walked, returning ErrNotFound if manifests are missing.
// NOTE: By default all manifests which exist will be walked
// and any non-existent manifests and their subobjects will be ignored.
func WithUsageManifestLimit(i int) UsageOpt {
        // If 0 then don't filter any manifests
        // By default limits to current platform
        limit := &i
        return func(cfg *usageOptions) error {
                cfg.manifestLimit = limit
                return nil
        }
}

// WithSnapshotUsage will check for referenced snapshots from the image objects
// and include the snapshot size in the total usage.
func WithSnapshotUsage() UsageOpt {
        enable := func(cfg *usageOptions) error {
                cfg.snapshots = true
                return nil
        }
        return enable
}

// WithManifestUsage is used to get the usage for an image based on what is
// reported by the manifests rather than what exists in the content store.
// NOTE: This function is best used with the manifest limit set to get a
// consistent value, otherwise non-existent manifests will be excluded.
func WithManifestUsage() UsageOpt {
        enable := func(cfg *usageOptions) error {
                cfg.manifestOnly = true
                return nil
        }
        return enable
}

// Compile-time check that *image implements Image.
var _ Image = (*image)(nil)

// NewImage returns a client image object from the metadata image, using the
// client's platform matcher. client must be non-nil.
func NewImage(client *Client, i images.Image) Image {
        img := &image{client: client, i: i}
        img.platform = client.platform
        return img
}

// NewImageWithPlatform returns a client image object from the metadata image,
// using the supplied platform matcher instead of the client's.
func NewImageWithPlatform(client *Client, i images.Image, platform platforms.MatchComparer) Image {
        img := &image{client: client, i: i, platform: platform}
        return img
}

// image is the client-side implementation of Image. It pairs image metadata
// with the client used to reach the content store and snapshotters.
type image struct {
        // client provides the services used by the image's methods.
        client *Client

        // i is the underlying image metadata.
        i        images.Image
        // platform is the matcher passed to platform-dependent lookups.
        platform platforms.MatchComparer
        // diffIDs caches the result of RootFS. It is accessed under mu.
        diffIDs  []digest.Digest

        // mu is held for the duration of RootFS and protects diffIDs.
        mu sync.Mutex
}

// Metadata returns the underlying image metadata.
func (i *image) Metadata() images.Image {
        return i.i
}

// Name returns the name recorded in the image metadata.
func (i *image) Name() string {
        return i.i.Name
}

// Target returns the target descriptor recorded in the image metadata.
func (i *image) Target() ocispec.Descriptor {
        return i.i.Target
}

// Labels returns the labels recorded in the image metadata. The map is
// returned as is, not copied.
func (i *image) Labels() map[string]string {
        return i.i.Labels
}

// RootFS returns the diff IDs of the image's rootfs for the image's platform.
//
// The result is cached: once a non-nil list has been resolved, later calls
// return it without consulting the content store. The mutex is held for the
// whole call, so concurrent callers are serialized. A failed lookup is not
// cached.
func (i *image) RootFS(ctx context.Context) ([]digest.Digest, error) {
        i.mu.Lock()
        defer i.mu.Unlock()

        if i.diffIDs == nil {
                resolved, err := i.i.RootFS(ctx, i.client.ContentStore(), i.platform)
                if err != nil {
                        return nil, err
                }
                i.diffIDs = resolved
        }
        return i.diffIDs, nil
}

// Size returns the result of usage.CalculateImageUsage for the image, run
// against the client's content store with a manifest limit of 1 for the
// image's platform and with manifest-reported usage enabled.
func (i *image) Size(ctx context.Context) (int64, error) {
        store := i.client.ContentStore()
        limit := usage.WithManifestLimit(i.platform, 1)
        fromManifest := usage.WithManifestUsage()
        return usage.CalculateImageUsage(ctx, i.i, store, limit, fromManifest)
}

// Usage returns a usage calculation for the image.
//
// The given options are applied in order to a zero-valued usageOptions; the
// first option error aborts the call. The resulting settings are translated
// into usage package options (manifest limit, snapshotters, manifest usage,
// in that order) and passed to usage.CalculateImageUsage together with the
// client's content store.
func (i *image) Usage(ctx context.Context, opts ...UsageOpt) (int64, error) {
        var cfg usageOptions
        for _, apply := range opts {
                if err := apply(&cfg); err != nil {
                        return 0, err
                }
        }

        var calcOpts []usage.Opt
        if limit := cfg.manifestLimit; limit != nil {
                calcOpts = append(calcOpts, usage.WithManifestLimit(i.platform, *limit))
        }
        if cfg.snapshots {
                calcOpts = append(calcOpts, usage.WithSnapshotters(i.client.SnapshotService))
        }
        if cfg.manifestOnly {
                calcOpts = append(calcOpts, usage.WithManifestUsage())
        }

        store := i.client.ContentStore()
        return usage.CalculateImageUsage(ctx, i.i, store, calcOpts...)
}

// Config returns the config descriptor of the image for the image's
// platform, resolved through the client's content store.
func (i *image) Config(ctx context.Context) (ocispec.Descriptor, error) {
        return i.i.Config(ctx, i.client.ContentStore(), i.platform)
}

// IsUnpacked reports whether the named snapshotter holds a snapshot whose key
// is the chain ID of the image's rootfs diff IDs.
//
// A not-found error from the snapshotter's Stat yields (false, nil). Any other
// failure, including failing to obtain the snapshotter or to resolve the
// rootfs, is returned as an error.
func (i *image) IsUnpacked(ctx context.Context, snapshotterName string) (bool, error) {
        sn, err := i.client.getSnapshotter(ctx, snapshotterName)
        if err != nil {
                return false, err
        }

        diffs, err := i.RootFS(ctx)
        if err != nil {
                return false, err
        }

        chainID := identity.ChainID(diffs).String()
        _, err = sn.Stat(ctx, chainID)
        switch {
        case err == nil:
                return true, nil
        case errdefs.IsNotFound(err):
                return false, nil
        default:
                return false, err
        }
}

// Spec returns the OCI image spec for the image. It resolves the config
// descriptor, reads the config blob from the content store and decodes it as
// JSON. Each failure is wrapped with a message naming the failing step; the
// decode error message also includes the blob contents.
func (i *image) Spec(ctx context.Context) (ocispec.Image, error) {
        desc, err := i.Config(ctx)
        if err != nil {
                return ocispec.Image{}, fmt.Errorf("get image config descriptor: %w", err)
        }

        blob, err := content.ReadBlob(ctx, i.ContentStore(), desc)
        if err != nil {
                return ocispec.Image{}, fmt.Errorf("read image config from content store: %w", err)
        }

        var ociImage ocispec.Image
        if err = json.Unmarshal(blob, &ociImage); err != nil {
                // Return whatever was decoded along with the error.
                return ociImage, fmt.Errorf("unmarshal image config %s: %w", blob, err)
        }
        return ociImage, nil
}

// UnpackConfig provides configuration for the unpack of an image. It is
// populated by UnpackOpt functions.
type UnpackConfig struct {
        // ApplyOpts for applying a diff to a snapshotter
        ApplyOpts []diff.ApplyOpt
        // SnapshotOpts for configuring a snapshotter
        SnapshotOpts []snapshots.Opt
        // CheckPlatformSupported is whether to validate that a snapshotter
        // supports an image's platform before unpacking
        CheckPlatformSupported bool
        // DuplicationSuppressor is used to make sure that there is only one
        // in-flight fetch request or unpack handler for a given descriptor's
        // digest or chain ID.
        DuplicationSuppressor kmutex.KeyedLocker
}

// UnpackOpt provides configuration for unpack. Each option receives the
// unpack context and the UnpackConfig to modify; a non-nil error aborts the
// unpack.
type UnpackOpt func(context.Context, *UnpackConfig) error

// WithSnapshotterPlatformCheck sets `CheckPlatformSupported` on the UnpackConfig
func WithSnapshotterPlatformCheck() UnpackOpt {
        return func(_ context.Context, cfg *UnpackConfig) error {
                cfg.CheckPlatformSupported = true
                return nil
        }
}

// WithUnpackDuplicationSuppressor sets `DuplicationSuppressor` on the UnpackConfig.
func WithUnpackDuplicationSuppressor(suppressor kmutex.KeyedLocker) UnpackOpt {
        return func(_ context.Context, cfg *UnpackConfig) error {
                cfg.DuplicationSuppressor = suppressor
                return nil
        }
}

// WithUnpackApplyOpts appends new apply options on the UnpackConfig.
func WithUnpackApplyOpts(opts ...diff.ApplyOpt) UnpackOpt {
        return func(_ context.Context, cfg *UnpackConfig) error {
                cfg.ApplyOpts = append(cfg.ApplyOpts, opts...)
                return nil
        }
}

// Unpack unpacks the image's content into a snapshot using the named
// snapshotter.
//
// It applies the image's layers in order through the client's diff service,
// building up the chain of diff digests as it goes. After all layers are
// applied, the image config blob in the content store is labelled with a
// "containerd.io/gc.ref.snapshot.<snapshotter>" label whose value is the chain
// ID of the applied layers. The work runs under a lease obtained from the
// client.
func (i *image) Unpack(ctx context.Context, snapshotterName string, opts ...UnpackOpt) error {
        // From here on ctx is the context returned by WithLease; done is
        // deferred so it runs when Unpack returns.
        ctx, done, err := i.client.WithLease(ctx)
        if err != nil {
                return err
        }
        defer done(ctx)

        var config UnpackConfig
        for _, o := range opts {
                if err := o(ctx, &config); err != nil {
                        return err
                }
        }

        manifest, err := i.getManifest(ctx, i.platform)
        if err != nil {
                return err
        }

        layers, err := i.getLayers(ctx, manifest)
        if err != nil {
                return err
        }

        var (
                a  = i.client.DiffService()
                cs = i.client.ContentStore()

                chain    []digest.Digest
                unpacked bool
        )
        // The resolved name is used for both the snapshotter lookup and the
        // label key written at the end.
        snapshotterName, err = i.client.resolveSnapshotterName(ctx, snapshotterName)
        if err != nil {
                return err
        }
        sn, err := i.client.getSnapshotter(ctx, snapshotterName)
        if err != nil {
                return err
        }
        if config.CheckPlatformSupported {
                if err := i.checkSnapshotterSupport(ctx, snapshotterName, manifest); err != nil {
                        return err
                }
        }

        for _, layer := range layers {
                // chain holds the diff digests of the layers handled so far.
                unpacked, err = rootfs.ApplyLayerWithOpts(ctx, layer, chain, sn, a, config.SnapshotOpts, config.ApplyOpts)
                if err != nil {
                        return fmt.Errorf("apply layer error for %q: %w", i.Name(), err)
                }

                if unpacked {
                        // Set the uncompressed label after the uncompressed
                        // digest has been verified through apply.
                        cinfo := content.Info{
                                Digest: layer.Blob.Digest,
                                Labels: map[string]string{
                                        labels.LabelUncompressed: layer.Diff.Digest.String(),
                                },
                        }
                        if _, err := cs.Update(ctx, cinfo, "labels."+labels.LabelUncompressed); err != nil {
                                return err
                        }
                }

                chain = append(chain, layer.Diff.Digest)
        }

        desc, err := i.i.Config(ctx, cs, i.platform)
        if err != nil {
                return err
        }

        // The chain ID of all applied diff digests is the label value.
        rootFS := identity.ChainID(chain).String()

        cinfo := content.Info{
                Digest: desc.Digest,
                Labels: map[string]string{
                        fmt.Sprintf("containerd.io/gc.ref.snapshot.%s", snapshotterName): rootFS,
                },
        }

        // Only the single gc.ref.snapshot label is named in the update.
        _, err = cs.Update(ctx, cinfo, fmt.Sprintf("labels.containerd.io/gc.ref.snapshot.%s", snapshotterName))
        return err
}

// getManifest resolves the manifest of the image target for the given
// platform from the image's content store. On failure it returns a zero
// manifest together with the error.
func (i *image) getManifest(ctx context.Context, platform platforms.MatchComparer) (ocispec.Manifest, error) {
        resolved, err := images.Manifest(ctx, i.ContentStore(), i.i.Target, platform)
        if err != nil {
                return ocispec.Manifest{}, err
        }
        return resolved, nil
}

// getLayers pairs each rootfs diff ID of the image with the corresponding
// layer blob descriptor from manifest, in order.
//
// Only manifest entries whose media type satisfies images.IsLayerType are
// considered. An error is returned when the rootfs cannot be resolved or when
// the number of diff IDs differs from the number of image layers.
func (i *image) getLayers(ctx context.Context, manifest ocispec.Manifest) ([]rootfs.Layer, error) {
        diffIDs, err := i.RootFS(ctx)
        if err != nil {
                return nil, fmt.Errorf("failed to resolve rootfs: %w", err)
        }

        // parse out the image layers from oci artifact layers
        var blobs []ocispec.Descriptor
        for _, desc := range manifest.Layers {
                if !images.IsLayerType(desc.MediaType) {
                        continue
                }
                blobs = append(blobs, desc)
        }
        if len(blobs) != len(diffIDs) {
                return nil, errors.New("mismatched image rootfs and manifest layers")
        }

        layers := make([]rootfs.Layer, 0, len(diffIDs))
        for idx, diffID := range diffIDs {
                layers = append(layers, rootfs.Layer{
                        Diff: ocispec.Descriptor{
                                // TODO: derive media type from compressed type
                                MediaType: ocispec.MediaTypeImageLayer,
                                Digest:    diffID,
                        },
                        Blob: blobs[idx],
                })
        }
        return layers, nil
}

// checkSnapshotterSupport verifies that the named snapshotter supports the
// platform recorded in the manifest's config. It returns nil when the
// snapshotter's platform matcher matches that platform, and an error naming
// the snapshotter, platform and config digest otherwise. Lookup failures are
// returned unchanged.
func (i *image) checkSnapshotterSupport(ctx context.Context, snapshotterName string, manifest ocispec.Manifest) error {
        matcher, err := i.client.GetSnapshotterSupportedPlatforms(ctx, snapshotterName)
        if err != nil {
                return err
        }

        manifestPlatform, err := images.ConfigPlatform(ctx, i.ContentStore(), manifest.Config)
        if err != nil {
                return err
        }

        if !matcher.Match(manifestPlatform) {
                return fmt.Errorf("snapshotter %s does not support platform %s for image %s", snapshotterName, manifestPlatform, manifest.Config.Digest)
        }
        return nil
}

// ContentStore returns the content store of the client the image was created
// with.
func (i *image) ContentStore() content.Store {
        return i.client.ContentStore()
}

// Platform returns the platform match comparer the image was created with.
func (i *image) Platform() platforms.MatchComparer {
        return i.platform
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"

        imagesapi "github.com/containerd/containerd/api/services/images/v1"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/epoch"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/errdefs"
        "google.golang.org/protobuf/types/known/timestamppb"
)

// remoteImages implements images.Store on top of an images API client.
type remoteImages struct {
        // client is the images API client every call is sent through.
        client imagesapi.ImagesClient
}

// NewImageStoreFromClient returns a new image store client backed by the
// given images API client.
func NewImageStoreFromClient(client imagesapi.ImagesClient) images.Store {
        store := &remoteImages{client: client}
        return store
}

// Get fetches the image with the given name from the remote images service.
// RPC errors are converted with errdefs.FromGRPC.
func (s *remoteImages) Get(ctx context.Context, name string) (images.Image, error) {
        req := &imagesapi.GetImageRequest{Name: name}

        resp, err := s.client.Get(ctx, req)
        if err != nil {
                return images.Image{}, errdefs.FromGRPC(err)
        }
        return imageFromProto(resp.Image), nil
}

// List returns the images reported by the remote images service for the
// given filters. RPC errors are converted with errdefs.FromGRPC.
func (s *remoteImages) List(ctx context.Context, filters ...string) ([]images.Image, error) {
        req := &imagesapi.ListImagesRequest{Filters: filters}

        resp, err := s.client.List(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        return imagesFromProto(resp.Images), nil
}

// Create sends the given image to the remote images service and returns the
// image contained in the response. When epoch.FromContext yields a time for
// ctx, it is sent as the request's SourceDateEpoch. RPC errors are converted
// with errdefs.FromGRPC.
func (s *remoteImages) Create(ctx context.Context, image images.Image) (images.Image, error) {
        req := &imagesapi.CreateImageRequest{Image: imageToProto(&image)}
        if sourceDate := epoch.FromContext(ctx); sourceDate != nil {
                req.SourceDateEpoch = timestamppb.New(*sourceDate)
        }

        resp, err := s.client.Create(ctx, req)
        if err != nil {
                return images.Image{}, errdefs.FromGRPC(err)
        }
        return imageFromProto(resp.Image), nil
}

// Update sends the given image to the remote images service and returns the
// image contained in the response. A field mask is attached only when at
// least one field path is supplied. When epoch.FromContext yields a time for
// ctx, it is sent as the request's SourceDateEpoch. RPC errors are converted
// with errdefs.FromGRPC.
func (s *remoteImages) Update(ctx context.Context, image images.Image, fieldpaths ...string) (images.Image, error) {
        req := &imagesapi.UpdateImageRequest{Image: imageToProto(&image)}
        if len(fieldpaths) > 0 {
                req.UpdateMask = &ptypes.FieldMask{Paths: fieldpaths}
        }
        if sourceDate := epoch.FromContext(ctx); sourceDate != nil {
                req.SourceDateEpoch = timestamppb.New(*sourceDate)
        }

        resp, err := s.client.Update(ctx, req)
        if err != nil {
                return images.Image{}, errdefs.FromGRPC(err)
        }
        return imageFromProto(resp.Image), nil
}

// Delete asks the remote images service to delete the image with the given
// name. The delete options are applied in order and the first option error
// aborts the call. The Synchronous option is sent as the request's Sync flag,
// and a Target is sent only when one was set. The RPC result is passed
// through errdefs.FromGRPC.
func (s *remoteImages) Delete(ctx context.Context, name string, opts ...images.DeleteOpt) error {
        var settings images.DeleteOptions
        for _, apply := range opts {
                if err := apply(ctx, &settings); err != nil {
                        return err
                }
        }

        req := &imagesapi.DeleteImageRequest{Name: name, Sync: settings.Synchronous}
        if target := settings.Target; target != nil {
                req.Target = oci.DescriptorToProto(*target)
        }

        _, err := s.client.Delete(ctx, req)
        return errdefs.FromGRPC(err)
}

// imageToProto converts an images.Image into its API message form, copying
// the name and labels and converting the target descriptor and timestamps.
// image must be non-nil.
func imageToProto(image *images.Image) *imagesapi.Image {
        pb := &imagesapi.Image{
                Name:   image.Name,
                Labels: image.Labels,
        }
        pb.Target = oci.DescriptorToProto(image.Target)
        pb.CreatedAt = protobuf.ToTimestamp(image.CreatedAt)
        pb.UpdatedAt = protobuf.ToTimestamp(image.UpdatedAt)
        return pb
}

// imageFromProto converts an API image message into an images.Image, copying
// the name and labels and converting the target descriptor and timestamps.
// imagepb must be non-nil.
func imageFromProto(imagepb *imagesapi.Image) images.Image {
        img := images.Image{
                Name:   imagepb.Name,
                Labels: imagepb.Labels,
        }
        img.Target = oci.DescriptorFromProto(imagepb.Target)
        img.CreatedAt = protobuf.FromTimestamp(imagepb.CreatedAt)
        img.UpdatedAt = protobuf.FromTimestamp(imagepb.UpdatedAt)
        return img
}

// imagesFromProto converts a list of API image messages into images.Image
// values, preserving order. It returns nil when the input is empty.
func imagesFromProto(imagespb []*imagesapi.Image) []images.Image {
        // Keep the nil result for empty input.
        if len(imagespb) == 0 {
                return nil
        }

        // The result is named so that it does not shadow the images package.
        converted := make([]images.Image, 0, len(imagespb))
        for _, pb := range imagespb {
                converted = append(converted, imageFromProto(pb))
        }
        return converted
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "io"

        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/images/archive"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// importOpts collects the settings applied by ImportOpt functions.
type importOpts struct {
        // indexName is set by WithIndexName.
        indexName       string
        // imageRefT is set by WithImageRefTranslator.
        imageRefT       func(string) string
        // dgstRefT is set by WithDigestRef.
        dgstRefT        func(digest.Digest) string
        // skipDgstRef is set by WithSkipDigestRef.
        skipDgstRef     func(string) bool
        // allPlatforms is set by WithAllPlatforms.
        allPlatforms    bool
        // platformMatcher is set by WithImportPlatform.
        platformMatcher platforms.MatchComparer
        compress        bool
        discardLayers   bool
        skipMissing     bool
        // imageLabels is set by WithImageLabels.
        imageLabels     map[string]string
}

// ImportOpt allows the caller to specify import specific options
type ImportOpt func(*importOpts) error

// WithImageRefTranslator sets the function used to turn an index reference
// into the image reference stored in the image store.
func WithImageRefTranslator(f func(string) string) ImportOpt {
        setTranslator := func(o *importOpts) error {
                o.imageRefT = f
                return nil
        }
        return setTranslator
}

// WithImageLabels sets the labels to apply to each imported image. The map
// is stored as given, not copied.
func WithImageLabels(labels map[string]string) ImportOpt {
        return ImportOpt(func(o *importOpts) error {
                o.imageLabels = labels
                return nil
        })
}

// WithDigestRef sets the function that names a digest image for each
// manifest in the index.
func WithDigestRef(f func(digest.Digest) string) ImportOpt {
        setDigestRef := func(o *importOpts) error {
                o.dgstRefT = f
                return nil
        }
        return setDigestRef
}

// WithSkipDigestRef sets a predicate deciding when WithDigestRef is not
// applied. The predicate receives the image reference, or an empty string
// when the image does not specify one; a true result skips the digest
// reference for that manifest.
func WithSkipDigestRef(f func(string) bool) ImportOpt {
        return ImportOpt(func(o *importOpts) error {
                o.skipDgstRef = f
                return nil
        })
}

// WithIndexName requests a tag with the given name that points at the
// imported index.
func WithIndexName(name string) ImportOpt {
        setIndexName := func(o *importOpts) error {
                o.indexName = name
                return nil
        }
        return setIndexName
}

// WithAllPlatforms controls whether content for every platform is imported.
func WithAllPlatforms(allPlatforms bool) ImportOpt {
        return ImportOpt(func(o *importOpts) error {
                o.allPlatforms = allPlatforms
                return nil
        })
}

// WithImportPlatform restricts the import to content matching the given
// platform matcher.
func WithImportPlatform(platformMacher platforms.MatchComparer) ImportOpt {
        setMatcher := func(o *importOpts) error {
                o.platformMatcher = platformMacher
                return nil
        }
        return setMatcher
}

// WithImportCompression enables compression of uncompressed layers during
// import. It is meant for import formats that do not include the manifest.
func WithImportCompression() ImportOpt {
        return ImportOpt(func(o *importOpts) error {
                o.compress = true
                return nil
        })
}

// WithDiscardUnpackedLayers lets the garbage collector remove layers from
// the content store once they have been unpacked.
func WithDiscardUnpackedLayers() ImportOpt {
        enableDiscard := func(o *importOpts) error {
                o.discardLayers = true
                return nil
        }
        return enableDiscard
}

// WithSkipMissing permits importing an archive that does not contain every
// blob it references.
func WithSkipMissing() ImportOpt {
        return ImportOpt(func(o *importOpts) error {
                o.skipMissing = true
                return nil
        })
}

// Import imports an image from a Tar stream using reader.
// Caller needs to specify importer. Future version may use oci.v1 as the default.
// Note that unreferenced blobs may be imported to the content store as well.
//
// The steps are: apply opts, obtain a lease for the duration of the call,
// import the archive index into the content store, walk the index to collect
// the image records to write (and to label child content), then update or
// create each record in the image service. The returned slice holds the
// records as returned by the image service.
func (c *Client) Import(ctx context.Context, reader io.Reader, opts ...ImportOpt) ([]images.Image, error) {
        var iopts importOpts
        for _, o := range opts {
                if err := o(&iopts); err != nil {
                        return nil, err
                }
        }

        // WithLease returns a no-op release function when ctx already
        // carries a lease, so an outer lease is never deleted here.
        ctx, done, err := c.WithLease(ctx)
        if err != nil {
                return nil, err
        }
        defer done(ctx)

        var aio []archive.ImportOpt
        if iopts.compress {
                aio = append(aio, archive.WithImportCompression())
        }

        index, err := archive.ImportIndex(ctx, c.ContentStore(), reader, aio...)
        if err != nil {
                return nil, err
        }

        var (
                imgs []images.Image
                cs   = c.ContentStore()
                is   = c.ImageService()
        )

        // Optionally tag the whole index under the caller-supplied name.
        if iopts.indexName != "" {
                imgs = append(imgs, images.Image{
                        Name:   iopts.indexName,
                        Target: index,
                })
        }
        // Matcher precedence: all platforms, then the explicit matcher, then
        // the client's default.
        var platformMatcher = c.platform
        if iopts.allPlatforms {
                platformMatcher = platforms.All
        } else if iopts.platformMatcher != nil {
                platformMatcher = iopts.platformMatcher
        }

        // handler appends to imgs as a side effect when it visits the index
        // descriptor; for every other descriptor it only reports children.
        var handler images.HandlerFunc = func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                // Only save images at top level
                if desc.Digest != index.Digest {
                        // Don't set labels on missing content.
                        children, err := images.Children(ctx, cs, desc)
                        if iopts.skipMissing && errdefs.IsNotFound(err) {
                                return nil, images.ErrSkipDesc
                        }
                        return children, err
                }

                idx, err := decodeIndex(ctx, cs, desc)
                if err != nil {
                        return nil, err
                }

                for _, m := range idx.Manifests {
                        // A manifest may contribute up to two records: one
                        // under its annotated name and one under a digest
                        // reference.
                        name := imageName(m.Annotations, iopts.imageRefT)
                        if name != "" {
                                imgs = append(imgs, images.Image{
                                        Name:   name,
                                        Target: m,
                                })
                        }
                        // The skip predicate sees the name even when it is
                        // empty.
                        if iopts.skipDgstRef != nil {
                                if iopts.skipDgstRef(name) {
                                        continue
                                }
                        }
                        if iopts.dgstRefT != nil {
                                ref := iopts.dgstRefT(m.Digest)
                                if ref != "" {
                                        imgs = append(imgs, images.Image{
                                                Name:   ref,
                                                Target: m,
                                        })
                                }
                        }
                }

                return idx.Manifests, nil
        }

        // Wrap the handler: platform filtering first, then child labelling
        // (with the layer-filtering label map when layers may be discarded).
        handler = images.FilterPlatforms(handler, platformMatcher)
        if iopts.discardLayers {
                handler = images.SetChildrenMappedLabels(cs, handler, images.ChildGCLabelsFilterLayers)
        } else {
                handler = images.SetChildrenLabels(cs, handler)
        }
        if err := images.WalkNotEmpty(ctx, handler, index); err != nil {
                return nil, err
        }

        for i := range imgs {
                // Always update the target; update labels only when the
                // caller supplied some.
                fieldsPath := []string{"target"}
                if iopts.imageLabels != nil {
                        fieldsPath = append(fieldsPath, "labels")
                        imgs[i].Labels = iopts.imageLabels
                }
                // Try an update first and fall back to create only when the
                // image does not exist yet.
                img, err := is.Update(ctx, imgs[i], fieldsPath...)
                if err != nil {
                        if !errdefs.IsNotFound(err) {
                                return nil, err
                        }

                        img, err = is.Create(ctx, imgs[i])
                        if err != nil {
                                return nil, err
                        }
                }
                imgs[i] = img
        }

        return imgs, nil
}

// imageName picks the image name from a manifest's annotations. The
// containerd image-name annotation wins when it is non-empty. Otherwise the
// OCI reference-name annotation is used, passed through ociCleanup when both
// the value is non-empty and ociCleanup is non-nil. An empty string means no
// name was found.
func imageName(annotations map[string]string, ociCleanup func(string) string) string {
        if explicit := annotations[images.AnnotationImageName]; explicit != "" {
                return explicit
        }

        ref := annotations[ocispec.AnnotationRefName]
        if ref == "" || ociCleanup == nil {
                return ref
        }
        return ociCleanup(ref)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "archive/tar"
        "context"
        "errors"
        "fmt"
        "os"
        "path/filepath"
        "runtime"
        "strings"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/archive"
        "github.com/containerd/containerd/v2/pkg/archive/compression"
)

// Install a binary image into the opt service.
// More info: https://github.com/containerd/containerd/blob/main/docs/managed-opt.md.
//
// The options are applied to an InstallConfig. The destination is the
// configured path or, when that is empty, the path exported by the "opt"
// plugin. Every layer of the manifest selected for the client's platform is
// decompressed and applied to the destination, keeping only entries whose
// directory is the bin directory (plus the lib directory when config.Libs is
// set). Unless config.Replace is set, an entry that already exists at the
// destination aborts the install with an error.
func (c *Client) Install(ctx context.Context, image Image, opts ...InstallOpts) error {
        var config InstallConfig
        for _, o := range opts {
                o(&config)
        }
        path, err := c.getInstallPath(ctx, config)
        if err != nil {
                return err
        }
        var (
                cs       = image.ContentStore()
                platform = c.platform
        )
        manifest, err := images.Manifest(ctx, cs, image.Target(), platform)
        if err != nil {
                return err
        }

        onWindows := runtime.GOOS == "windows"
        binDir, libDir := "bin", "lib"
        if onWindows {
                binDir, libDir = "Files\\bin", "Files\\lib"
        }

        // The filter and the apply options do not depend on the layer, so
        // they are built once rather than on every iteration.
        filter := archive.WithFilter(func(hdr *tar.Header) (bool, error) {
                dir := filepath.Dir(hdr.Name)
                keep := dir == binDir
                if config.Libs {
                        keep = keep || dir == libDir
                }

                // On Windows the first "Files" occurrence is stripped from
                // the entry name before it is checked and extracted.
                if onWindows {
                        hdr.Name = strings.Replace(hdr.Name, "Files", "", 1)
                }
                if keep && !config.Replace {
                        if _, err := os.Lstat(filepath.Join(path, hdr.Name)); err == nil {
                                return false, fmt.Errorf("cannot replace %s in %s", hdr.Name, path)
                        }
                }
                return keep, nil
        })
        applyOpts := []archive.ApplyOpt{filter}
        if onWindows {
                applyOpts = append(applyOpts, archive.WithNoSameOwner())
        }

        for _, layer := range manifest.Layers {
                // Each layer runs in its own function so the deferred closes
                // fire per layer: the decompression stream first, then the
                // content reader, on every exit path.
                err := func() error {
                        ra, err := cs.ReaderAt(ctx, layer)
                        if err != nil {
                                return err
                        }
                        defer ra.Close()

                        r, err := compression.DecompressStream(content.NewReader(ra))
                        if err != nil {
                                return err
                        }
                        defer r.Close()

                        _, err = archive.Apply(ctx, path, r, applyOpts...)
                        return err
                }()
                if err != nil {
                        return err
                }
        }
        return nil
}

// getInstallPath returns the directory to install into. An explicit
// config.Path is used when set. Otherwise the introspection service is
// queried with the filter "id==opt": exactly one plugin must match, and its
// exported "path" value must be non-empty.
func (c *Client) getInstallPath(ctx context.Context, config InstallConfig) (string, error) {
        if configured := config.Path; configured != "" {
                return configured, nil
        }

        resp, err := c.IntrospectionService().Plugins(ctx, "id==opt")
        switch {
        case err != nil:
                return "", err
        case len(resp.Plugins) != 1:
                return "", errors.New("opt service not enabled")
        }

        if exported := resp.Plugins[0].Exports["path"]; exported != "" {
                return exported, nil
        }
        return "", errors.New("opt path not exported")
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

type (
        // InstallOpts configures binary installs
        InstallOpts func(*InstallConfig)

        // InstallConfig sets the binary install configuration
        InstallConfig struct {
                // Libs installs libs from the image
                Libs bool
                // Replace will overwrite existing binaries or libs in the opt directory
                Replace bool
                // Path to install libs and binaries to
                Path string
        }
)

// WithInstallLibs is an InstallOpts that turns on installation of the
// image's libs.
func WithInstallLibs(cfg *InstallConfig) {
        cfg.Libs = true
}

// WithInstallReplace is an InstallOpts that allows existing files to be
// replaced.
func WithInstallReplace(cfg *InstallConfig) {
        cfg.Replace = true
}

// WithInstallPath returns an InstallOpts that sets the optional install path.
func WithInstallPath(path string) InstallOpts {
        setPath := func(cfg *InstallConfig) {
                cfg.Path = path
        }
        return setPath
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "time"

        "github.com/containerd/containerd/v2/core/leases"
)

// WithLease attaches a lease on the context.
//
// If ctx already carries a lease it is returned unchanged with a no-op
// release function. Otherwise a lease is created through the client's lease
// service, using a random ID and a 24 hour expiration when no opts are
// given, and the returned function deletes that lease. On creation failure
// the original ctx, a no-op release function and the error are returned.
func (c *Client) WithLease(ctx context.Context, opts ...leases.Opt) (context.Context, func(context.Context) error, error) {
        noop := func(context.Context) error { return nil }

        if _, leased := leases.FromContext(ctx); leased {
                return ctx, noop, nil
        }

        manager := c.LeasesService()

        // Use default lease configuration if no options provided
        if len(opts) == 0 {
                opts = []leases.Opt{
                        leases.WithRandomID(),
                        leases.WithExpiration(24 * time.Hour),
                }
        }

        lease, err := manager.Create(ctx, opts...)
        if err != nil {
                return ctx, noop, err
        }

        release := func(ctx context.Context) error {
                return manager.Delete(ctx, lease)
        }
        return leases.WithLease(ctx, lease.ID), release, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "strings"

        api "github.com/containerd/containerd/api/services/namespaces/v1"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/errdefs"
)

// remoteNamespaces implements namespaces.Store on top of a namespaces API
// client.
type remoteNamespaces struct {
        client api.NamespacesClient
}

// NewNamespaceStoreFromClient returns a new namespace store backed by the
// given API client.
func NewNamespaceStoreFromClient(client api.NamespacesClient) namespaces.Store {
        store := new(remoteNamespaces)
        store.client = client
        return store
}

// Create asks the remote service to create a namespace with the given name
// and labels. A failure is returned after conversion with errdefs.FromGRPC.
func (r *remoteNamespaces) Create(ctx context.Context, namespace string, labels map[string]string) error {
        req := &api.CreateNamespaceRequest{
                Namespace: &api.Namespace{
                        Name:   namespace,
                        Labels: labels,
                },
        }

        if _, err := r.client.Create(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Labels fetches the named namespace from the remote service and returns
// its labels. A failure is returned after conversion with errdefs.FromGRPC.
func (r *remoteNamespaces) Labels(ctx context.Context, namespace string) (map[string]string, error) {
        req := &api.GetNamespaceRequest{Name: namespace}

        resp, err := r.client.Get(ctx, req)
        if err == nil {
                return resp.Namespace.Labels, nil
        }
        return nil, errdefs.FromGRPC(err)
}

// SetLabel sets a single label on the named namespace. The update request
// carries only that key/value pair, and its field mask is restricted to the
// path "labels.<key>". A failure is returned after conversion with
// errdefs.FromGRPC.
func (r *remoteNamespaces) SetLabel(ctx context.Context, namespace, key, value string) error {
        maskPath := strings.Join([]string{"labels", key}, ".")

        req := &api.UpdateNamespaceRequest{
                Namespace: &api.Namespace{
                        Name:   namespace,
                        Labels: map[string]string{key: value},
                },
                UpdateMask: &types.FieldMask{
                        Paths: []string{maskPath},
                },
        }

        if _, err := r.client.Update(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// List returns the names of the namespaces reported by the remote service,
// in response order. The result is nil when the response has no namespaces.
// A failure is returned after conversion with errdefs.FromGRPC.
func (r *remoteNamespaces) List(ctx context.Context) ([]string, error) {
        resp, err := r.client.List(ctx, &api.ListNamespacesRequest{})
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        var names []string
        for i := range resp.Namespaces {
                names = append(names, resp.Namespaces[i].Name)
        }
        return names, nil
}

// Delete removes the named namespace through the remote service. Each
// option is run against a DeleteInfo seeded with the namespace name and the
// first option error aborts the call. The request itself is built from the
// namespace argument only. A service failure is returned after conversion
// with errdefs.FromGRPC.
func (r *remoteNamespaces) Delete(ctx context.Context, namespace string, opts ...namespaces.DeleteOpts) error {
        info := namespaces.DeleteInfo{
                Name: namespace,
        }
        for _, apply := range opts {
                if err := apply(ctx, &info); err != nil {
                        return err
                }
        }

        req := &api.DeleteNamespaceRequest{Name: namespace}
        if _, err := r.client.Delete(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "fmt"
        "strings"
        "syscall"
        "time"

        "github.com/containerd/containerd/api/services/tasks/v1"
        "github.com/containerd/containerd/v2/pkg/cio"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
)

// Process represents a system process.
//
// Every method that takes a context may return an error; see the individual
// methods for what each one does.
type Process interface {
        // ID of the process
        ID() string
        // Pid is the system specific process id
        Pid() uint32
        // Start starts the process executing the user's defined binary
        Start(context.Context) error
        // Delete removes the process and any resources allocated returning the exit status
        Delete(context.Context, ...ProcessDeleteOpts) (*ExitStatus, error)
        // Kill sends the provided signal to the process
        Kill(context.Context, syscall.Signal, ...KillOpts) error
        // Wait asynchronously waits for the process to exit, and sends the exit code to the returned channel
        Wait(context.Context) (<-chan ExitStatus, error)
        // CloseIO allows various pipes to be closed on the process
        CloseIO(context.Context, ...IOCloserOpts) error
        // Resize changes the width and height of the process's terminal
        Resize(ctx context.Context, w, h uint32) error
        // IO returns the io set for the process
        IO() cio.IO
        // Status returns the executing status of the process
        Status(context.Context) (Status, error)
}

// ExitStatus encapsulates a process's exit status.
// It is used by `Wait()` to return either a process exit code or an error
type ExitStatus struct {
        code     uint32
        exitedAt time.Time
        err      error
}

// NewExitStatus populates an ExitStatus
func NewExitStatus(code uint32, t time.Time, err error) *ExitStatus {
        status := new(ExitStatus)
        status.code, status.exitedAt, status.err = code, t, err
        return status
}

// Result returns the exit code and time of the exit status.
// The returned error, if any, indicates that something went wrong at some
// point while waiting for the exit status. It does not signify an error
// with the process itself.
//
// If an error is returned, the process may still be running.
func (s ExitStatus) Result() (uint32, time.Time, error) {
        code, exitedAt, err := s.code, s.exitedAt, s.err
        return code, exitedAt, err
}

// ExitCode returns the exit code of the process.
// This is only valid if Error() returns nil.
func (s ExitStatus) ExitCode() uint32 {
        code := s.code
        return code
}

// ExitTime returns the exit time of the process
// This is only valid if Error() returns nil.
func (s ExitStatus) ExitTime() time.Time {
        exitedAt := s.exitedAt
        return exitedAt
}

// Error returns the error, if any, that occurred while waiting for the
// process.
func (s ExitStatus) Error() error {
        err := s.err
        return err
}

// process is the client-side handle for an exec process that belongs to a
// task. Its methods call the task service with the task's id as the
// container ID and the process id as the exec ID.
type process struct {
        // id is sent as ExecID in task service requests.
        id   string
        // task is the owning task; it supplies the container ID and client.
        task *task
        // pid is filled in by Start from the start response.
        pid  uint32
        // io is the process's IO set, if any; Start (on failure) and Delete
        // cancel, wait on and close it.
        io   cio.IO
}

// ID returns the id of the process
func (p *process) ID() string {
        return p.id
}

// Pid returns the pid of the process
// The pid is not set until start is called and returns
func (p *process) Pid() uint32 {
        return p.pid
}

// Start starts the exec process through the task service and records the
// pid from the response. If the start call fails, any IO attached to the
// process is cancelled, waited on and closed before the error is returned
// after conversion with errdefs.FromGRPC.
func (p *process) Start(ctx context.Context) error {
        req := &tasks.StartRequest{
                ContainerID: p.task.id,
                ExecID:      p.id,
        }

        resp, err := p.task.client.TaskService().Start(ctx, req)
        if err == nil {
                p.pid = resp.Pid
                return nil
        }

        if pio := p.io; pio != nil {
                pio.Cancel()
                pio.Wait()
                pio.Close()
        }
        return errdefs.FromGRPC(err)
}

// Kill sends signal s to the process through the task service. The options
// are applied to a KillInfo first, and the first option error aborts the
// call; the resulting All flag is forwarded in the request. The service
// error is returned after conversion with errdefs.FromGRPC.
func (p *process) Kill(ctx context.Context, s syscall.Signal, opts ...KillOpts) error {
        var info KillInfo
        for _, apply := range opts {
                if err := apply(ctx, &info); err != nil {
                        return err
                }
        }

        req := &tasks.KillRequest{
                Signal:      uint32(s),
                ContainerID: p.task.id,
                ExecID:      p.id,
                All:         info.All,
        }
        _, err := p.task.client.TaskService().Kill(ctx, req)
        return errdefs.FromGRPC(err)
}

// Wait starts a goroutine that calls the task service's Wait for this
// process and returns immediately with a channel. Exactly one ExitStatus is
// sent on the channel, which is then closed. If the wait call fails, the
// status carries UnknownExitStatus and the error as returned by the
// service; otherwise it carries the exit code and exit time from the
// response. The returned error is always nil.
func (p *process) Wait(ctx context.Context) (<-chan ExitStatus, error) {
        // Buffered so the goroutine can finish even if nobody receives.
        results := make(chan ExitStatus, 1)

        go func() {
                defer close(results)

                var status ExitStatus
                resp, err := p.task.client.TaskService().Wait(ctx, &tasks.WaitRequest{
                        ContainerID: p.task.id,
                        ExecID:      p.id,
                })
                if err != nil {
                        status.code = UnknownExitStatus
                        status.err = err
                } else {
                        status.code = resp.ExitStatus
                        status.exitedAt = protobuf.FromTimestamp(resp.ExitedAt)
                }
                results <- status
        }()

        return results, nil
}

// CloseIO asks the task service to close IO on the process. The options are
// applied to an IOCloseInfo, and its Stdin flag is forwarded in the request.
// The service error is returned after conversion with errdefs.FromGRPC.
func (p *process) CloseIO(ctx context.Context, opts ...IOCloserOpts) error {
        var info IOCloseInfo
        for _, apply := range opts {
                apply(&info)
        }

        req := &tasks.CloseIORequest{
                ContainerID: p.task.id,
                ExecID:      p.id,
                Stdin:       info.Stdin,
        }
        _, err := p.task.client.TaskService().CloseIO(ctx, req)
        return errdefs.FromGRPC(err)
}

// IO returns the io set held by the process; it may be nil.
func (p *process) IO() cio.IO {
        return p.io
}

// Resize asks the task service to resize the process's pty to width w and
// height h. The service error is returned after conversion with
// errdefs.FromGRPC.
func (p *process) Resize(ctx context.Context, w, h uint32) error {
        req := &tasks.ResizePtyRequest{
                ContainerID: p.task.id,
                Width:       w,
                Height:      h,
                ExecID:      p.id,
        }
        _, err := p.task.client.TaskService().ResizePty(ctx, req)
        return errdefs.FromGRPC(err)
}

// Delete removes the process through the task service and returns its exit
// status. The options run first, and the first option error aborts the
// call. A process whose current status is running, paused or pausing is
// rejected with an error wrapping errdefs.ErrFailedPrecondition. After a
// successful delete, any IO attached to the process is cancelled, waited on
// and closed.
func (p *process) Delete(ctx context.Context, opts ...ProcessDeleteOpts) (*ExitStatus, error) {
        for _, apply := range opts {
                if err := apply(ctx, p); err != nil {
                        return nil, err
                }
        }

        status, err := p.Status(ctx)
        if err != nil {
                return nil, err
        }
        if state := status.Status; state == Running || state == Paused || state == Pausing {
                return nil, fmt.Errorf("current process state: %s, process must be stopped before deletion: %w", state, errdefs.ErrFailedPrecondition)
        }

        req := &tasks.DeleteProcessRequest{
                ContainerID: p.task.id,
                ExecID:      p.id,
        }
        resp, err := p.task.client.TaskService().DeleteProcess(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        if pio := p.io; pio != nil {
                pio.Cancel()
                pio.Wait()
                pio.Close()
        }

        exit := &ExitStatus{
                code:     resp.ExitStatus,
                exitedAt: protobuf.FromTimestamp(resp.ExitedAt),
        }
        return exit, nil
}

// Status fetches the process from the task service and returns its status,
// taken from the lower-cased string form of the reported state, together
// with its exit status. A service failure is returned after conversion with
// errdefs.FromGRPC.
func (p *process) Status(ctx context.Context) (Status, error) {
        req := &tasks.GetRequest{
                ContainerID: p.task.id,
                ExecID:      p.id,
        }

        resp, err := p.task.client.TaskService().Get(ctx, req)
        if err != nil {
                return Status{}, errdefs.FromGRPC(err)
        }

        proc := resp.Process
        state := strings.ToLower(proc.Status.String())
        return Status{
                Status:     ProcessStatus(state),
                ExitStatus: proc.ExitStatus,
        }, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "errors"
        "fmt"

        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "golang.org/x/sync/semaphore"

        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/containerd/v2/core/remotes/docker"
        "github.com/containerd/containerd/v2/core/remotes/docker/schema1" //nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility.
        "github.com/containerd/containerd/v2/core/unpack"
        "github.com/containerd/containerd/v2/pkg/tracing"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
)

const (
        // pullSpanPrefix is the first component passed to tracing.Name for
        // the spans started by the pull code ("Pull", "UnpackWait", "fetch").
        pullSpanPrefix = "pull"
)

// Pull downloads the provided content into containerd's content store
// and returns a platform specific image object
//
// The opts are applied to a default remote context. When no platform
// matcher is set, it is derived from the configured platforms: none selects
// the client's default, exactly one is parsed and matched exclusively, and
// more than one is an error. The work runs under a lease obtained with
// WithLease. When unpacking is requested, an unpacker is wrapped around the
// fetch handlers and is waited on before the image record is created.
//
// retErr is a named result so the deferred unpacker wait can report an
// unpack failure when the function would otherwise return nil.
func (c *Client) Pull(ctx context.Context, ref string, opts ...RemoteOpt) (_ Image, retErr error) {
        ctx, span := tracing.StartSpan(ctx, tracing.Name(pullSpanPrefix, "Pull"))
        defer span.End()

        pullCtx := defaultRemoteContext()

        for _, o := range opts {
                if err := o(c, pullCtx); err != nil {
                        return nil, err
                }
        }

        // Derive a matcher only when the options did not set one.
        if pullCtx.PlatformMatcher == nil {
                if len(pullCtx.Platforms) > 1 {
                        return nil, errors.New("cannot pull multiplatform image locally, try Fetch")
                } else if len(pullCtx.Platforms) == 0 {
                        pullCtx.PlatformMatcher = c.platform
                } else {
                        p, err := platforms.Parse(pullCtx.Platforms[0])
                        if err != nil {
                                return nil, fmt.Errorf("invalid platform %s: %w", pullCtx.Platforms[0], err)
                        }

                        pullCtx.PlatformMatcher = platforms.Only(p)
                }
        }

        span.SetAttributes(
                tracing.Attribute("image.ref", ref),
                tracing.Attribute("unpack", pullCtx.Unpack),
                tracing.Attribute("max.concurrent.downloads", pullCtx.MaxConcurrentDownloads),
                tracing.Attribute("platforms.count", len(pullCtx.Platforms)),
        )

        // WithLease returns a no-op release function when ctx already
        // carries a lease.
        ctx, done, err := c.WithLease(ctx)
        if err != nil {
                return nil, err
        }
        defer done(ctx)

        // unpacker stays nil unless unpacking was requested; later steps
        // check it for nil.
        var unpacker *unpack.Unpacker

        if pullCtx.Unpack {
                snapshotterName, err := c.resolveSnapshotterName(ctx, pullCtx.Snapshotter)
                if err != nil {
                        return nil, fmt.Errorf("unable to resolve snapshotter: %w", err)
                }
                span.SetAttributes(tracing.Attribute("snapshotter.name", snapshotterName))
                var uconfig UnpackConfig
                for _, opt := range pullCtx.UnpackOpts {
                        if err := opt(ctx, &uconfig); err != nil {
                                return nil, err
                        }
                }
                // The matcher is left nil when CheckPlatformSupported is
                // set; otherwise every platform is accepted.
                var platformMatcher platforms.Matcher
                if !uconfig.CheckPlatformSupported {
                        platformMatcher = platforms.All
                }

                // Check client Unpack config
                platform := unpack.Platform{
                        Platform:       platformMatcher,
                        SnapshotterKey: snapshotterName,
                        Snapshotter:    c.SnapshotService(snapshotterName),
                        SnapshotOpts:   append(pullCtx.SnapshotterOpts, uconfig.SnapshotOpts...),
                        Applier:        c.DiffService(),
                        ApplyOpts:      uconfig.ApplyOpts,
                }
                uopts := []unpack.UnpackerOpt{unpack.WithUnpackPlatform(platform)}
                if pullCtx.MaxConcurrentDownloads > 0 {
                        uopts = append(uopts, unpack.WithLimiter(semaphore.NewWeighted(int64(pullCtx.MaxConcurrentDownloads))))
                }
                if uconfig.DuplicationSuppressor != nil {
                        uopts = append(uopts, unpack.WithDuplicationSuppressor(uconfig.DuplicationSuppressor))
                }
                unpacker, err = unpack.NewUnpacker(ctx, c.ContentStore(), uopts...)
                if err != nil {
                        return nil, fmt.Errorf("unable to initialize unpacker: %w", err)
                }
                // Wait on the unpacker on every return path, including early
                // error returns; its error is reported only when nothing
                // else failed.
                defer func() {
                        if _, err := unpacker.Wait(); err != nil {
                                if retErr == nil {
                                        retErr = fmt.Errorf("unpack: %w", err)
                                }
                        }
                }()
                // Chain the unpacker outside any caller-provided handler
                // wrapper.
                wrapper := pullCtx.HandlerWrapper
                pullCtx.HandlerWrapper = func(h images.Handler) images.Handler {
                        if wrapper == nil {
                                return unpacker.Unpack(h)
                        }
                        return unpacker.Unpack(wrapper(h))
                }
        }

        img, err := c.fetch(ctx, pullCtx, ref, 1)
        if err != nil {
                return nil, err
        }

        // NOTE(fuweid): unpacker defers blobs download. before create image
        // record in ImageService, should wait for unpacking(including blobs
        // download).
        var ur unpack.Result
        if unpacker != nil {
                _, unpackSpan := tracing.StartSpan(ctx, tracing.Name(pullSpanPrefix, "UnpackWait"))
                if ur, err = unpacker.Wait(); err != nil {
                        unpackSpan.SetStatus(err)
                        unpackSpan.End()
                        return nil, err
                }
                unpackSpan.End()
        }

        img, err = c.createNewImage(ctx, img)
        if err != nil {
                return nil, err
        }

        i := NewImageWithPlatform(c, img, pullCtx.PlatformMatcher)
        // "image.ref" is recorded again, this time with the name of the
        // created image rather than the requested ref.
        span.SetAttributes(tracing.Attribute("image.ref", i.Name()))

        if unpacker != nil && ur.Unpacks == 0 {
                // Unpack was tried previously but nothing was unpacked
                // This is at least required for schema 1 image.
                if err := i.Unpack(ctx, pullCtx.Snapshotter, pullCtx.UnpackOpts...); err != nil {
                        return nil, fmt.Errorf("failed to unpack image on snapshotter %s: %w", pullCtx.Snapshotter, err)
                }
        }

        return i, nil
}

// fetch resolves ref through rCtx.Resolver, dispatches a handler chain over
// the resolved descriptor and returns an images.Image built from the result.
// Only the Name, Target and Labels fields of the returned image are set; the
// image is not created in the image service here.
//
// When limit is greater than zero it is passed to images.LimitManifests to
// bound the number of manifests that are handled.
//
// Errors from resolving, obtaining the fetcher and creating the schema 1
// converter are wrapped with context; all other errors are returned as-is.
func (c *Client) fetch(ctx context.Context, rCtx *RemoteContext, ref string, limit int) (images.Image, error) {
        ctx, span := tracing.StartSpan(ctx, tracing.Name(pullSpanPrefix, "fetch"))
        defer span.End()
        store := c.ContentStore()
        name, desc, err := rCtx.Resolver.Resolve(ctx, ref)
        if err != nil {
                return images.Image{}, fmt.Errorf("failed to resolve reference %q: %w", ref, err)
        }

        fetcher, err := rCtx.Resolver.Fetcher(ctx, name)
        if err != nil {
                return images.Image{}, fmt.Errorf("failed to get fetcher for %q: %w", name, err)
        }

        var (
                handler images.Handler

                // isConvertible and converterFunc together decide whether desc is
                // replaced by a converted descriptor after dispatch.
                isConvertible         bool
                originalSchema1Digest string
                converterFunc         func(context.Context, ocispec.Descriptor) (ocispec.Descriptor, error)
                // limiter stays nil unless MaxConcurrentDownloads is positive.
                limiter *semaphore.Weighted
        )

        if desc.MediaType == images.MediaTypeDockerSchema1Manifest && rCtx.ConvertSchema1 {
                // Docker schema 1 manifest with conversion requested: the schema 1
                // converter is the only handler added after the base handlers.
                schema1Converter, err := schema1.NewConverter(store, fetcher)
                if err != nil {
                        return images.Image{}, fmt.Errorf("failed to get converter for %q: %w", ref, err)
                }

                // NOTE(review): append may write into rCtx.BaseHandlers' backing
                // array when it has spare capacity; confirm callers do not share it.
                handler = images.Handlers(append(rCtx.BaseHandlers, schema1Converter)...)

                isConvertible = true

                converterFunc = func(ctx context.Context, _ ocispec.Descriptor) (ocispec.Descriptor, error) {
                        return schema1Converter.Convert(ctx)
                }

                // Remembered so it can be stored as an image label further down.
                originalSchema1Digest = desc.Digest.String()
        } else {
                // Get all the children for a descriptor
                childrenHandler := images.ChildrenHandler(store)
                // Set any children labels for that content
                childrenHandler = images.SetChildrenMappedLabels(store, childrenHandler, rCtx.ChildLabelMap)
                if rCtx.AllMetadata {
                        // Filter manifests by platforms but allow to handle manifest
                        // and configuration for not-target platforms
                        childrenHandler = remotes.FilterManifestByPlatformHandler(childrenHandler, rCtx.PlatformMatcher)
                } else {
                        // Filter children by platforms if specified.
                        childrenHandler = images.FilterPlatforms(childrenHandler, rCtx.PlatformMatcher)
                }
                // Sort and limit manifests if a finite number is needed
                if limit > 0 {
                        childrenHandler = images.LimitManifests(childrenHandler, rCtx.PlatformMatcher, limit)
                }

                // set isConvertible to true if there is application/octet-stream media type
                // This handler never yields children; it only flips the flag.
                convertibleHandler := images.HandlerFunc(
                        func(_ context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                                if desc.MediaType == docker.LegacyConfigMediaType {
                                        isConvertible = true
                                }

                                return []ocispec.Descriptor{}, nil
                        },
                )

                appendDistSrcLabelHandler, err := docker.AppendDistributionSourceLabel(store, ref)
                if err != nil {
                        return images.Image{}, err
                }

                // Handler order: caller-supplied base handlers, fetch, convertible
                // detection, children discovery, distribution source labelling.
                // NOTE(review): append may write into rCtx.BaseHandlers' backing
                // array when it has spare capacity; confirm callers do not share it.
                handlers := append(rCtx.BaseHandlers,
                        remotes.FetchHandler(store, fetcher),
                        convertibleHandler,
                        childrenHandler,
                        appendDistSrcLabelHandler,
                )

                handler = images.Handlers(handlers...)

                converterFunc = func(ctx context.Context, desc ocispec.Descriptor) (ocispec.Descriptor, error) {
                        return docker.ConvertManifest(ctx, store, desc)
                }
        }

        // Let the caller-supplied wrapper decorate the complete handler chain.
        if rCtx.HandlerWrapper != nil {
                handler = rCtx.HandlerWrapper(handler)
        }

        if rCtx.MaxConcurrentDownloads > 0 {
                limiter = semaphore.NewWeighted(int64(rCtx.MaxConcurrentDownloads))
        }

        if err := images.Dispatch(ctx, handler, limiter, desc); err != nil {
                return images.Image{}, err
        }

        // converterFunc is assigned in both branches above, so it is non-nil
        // whenever isConvertible has been set.
        if isConvertible {
                if desc, err = converterFunc(ctx, desc); err != nil {
                        return images.Image{}, err
                }
        }

        // Record the digest of the original schema 1 manifest as a label. This
        // writes into rCtx.Labels itself, allocating the map if it was nil.
        if originalSchema1Digest != "" {
                if rCtx.Labels == nil {
                        rCtx.Labels = make(map[string]string)
                }
                rCtx.Labels[images.ConvertedDockerSchema1LabelKey] = originalSchema1Digest
        }

        return images.Image{
                Name:   name,
                Target: desc,
                Labels: rCtx.Labels,
        }, nil
}

// createNewImage stores img through the image service and returns the stored
// record. It first tries Create; if the image already exists it falls back to
// Update. Should Update report NotFound (the image was removed in between),
// the whole sequence is retried. Any other error is returned unchanged
// together with a zero images.Image.
func (c *Client) createNewImage(ctx context.Context, img images.Image) (images.Image, error) {
        ctx, span := tracing.StartSpan(ctx, tracing.Name(pullSpanPrefix, "pull.createNewImage"))
        defer span.End()

        store := c.ImageService()
        for {
                created, err := store.Create(ctx, img)
                if err == nil {
                        return created, nil
                }
                if !errdefs.IsAlreadyExists(err) {
                        return images.Image{}, err
                }

                updated, err := store.Update(ctx, img)
                if err == nil {
                        return updated, nil
                }
                if !errdefs.IsNotFound(err) {
                        return images.Image{}, err
                }
                // The image was removed between Create and Update; try again.
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "errors"
        "fmt"
        "time"

        "github.com/containerd/containerd/v2/core/containers"
        api "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/errdefs"
        "github.com/containerd/typeurl/v2"
)

// Sandbox is a high level client to containerd's sandboxes.
type Sandbox interface {
        // ID returns the sandbox identifier.
        ID() string
        // Metadata returns the metadata of the sandbox.
        Metadata() api.Sandbox
        // NewContainer creates a new container that will belong to this sandbox.
        NewContainer(ctx context.Context, id string, opts ...NewContainerOpts) (Container, error)
        // Labels returns the labels set on the sandbox.
        Labels(ctx context.Context) (map[string]string, error)
        // Start starts a new sandbox instance.
        Start(ctx context.Context) error
        // Stop sends a stop request to the shim instance.
        Stop(ctx context.Context) error
        // Wait returns a channel on which the exit status of the sandbox
        // process is delivered.
        Wait(ctx context.Context) (<-chan ExitStatus, error)
        // Shutdown removes the sandbox from the metadata store and shuts down
        // the shim instance.
        Shutdown(ctx context.Context) error
}

// sandboxClient pairs a Client with the metadata of a single sandbox; its
// methods use the client to reach the sandbox store and sandbox controller.
type sandboxClient struct {
        // client is used for all store and controller calls.
        client *Client
        // metadata is the sandbox record as returned by the sandbox store when
        // this value was constructed.
        metadata api.Sandbox
}

// ID returns the identifier recorded in the sandbox metadata.
func (s *sandboxClient) ID() string {
        id := s.metadata.ID
        return id
}

// Metadata returns, by value, the sandbox metadata held by this client.
func (s *sandboxClient) Metadata() api.Sandbox {
        meta := s.metadata
        return meta
}

// NewContainer creates a container through the underlying client, passing the
// caller's opts followed by a WithSandbox option carrying this sandbox's ID.
func (s *sandboxClient) NewContainer(ctx context.Context, id string, opts ...NewContainerOpts) (Container, error) {
        withSandbox := append(opts, WithSandbox(s.ID()))
        return s.client.NewContainer(ctx, id, withSandbox...)
}

// Labels looks the sandbox up in the sandbox store by ID and returns the
// labels of the stored record.
func (s *sandboxClient) Labels(ctx context.Context) (map[string]string, error) {
        store := s.client.SandboxStore()
        stored, err := store.Get(ctx, s.ID())
        if err != nil {
                return nil, err
        }
        return stored.Labels, nil
}

// Start asks the controller selected by the sandbox's Sandboxer to start the
// sandbox. The controller's start result is discarded; only the error is
// reported.
func (s *sandboxClient) Start(ctx context.Context) error {
        controller := s.client.SandboxController(s.metadata.Sandboxer)
        if _, err := controller.Start(ctx, s.ID()); err != nil {
                return err
        }
        return nil
}

// Wait returns immediately with a channel of capacity one. A goroutine calls
// the sandbox controller's Wait, sends exactly one ExitStatus on the channel
// and then closes it. If the controller call fails, the status carries
// UnknownExitStatus together with the error. The returned error is always nil.
func (s *sandboxClient) Wait(ctx context.Context) (<-chan ExitStatus, error) {
        statusC := make(chan ExitStatus, 1)

        go func() {
                defer close(statusC)

                controller := s.client.SandboxController(s.metadata.Sandboxer)
                exit, err := controller.Wait(ctx, s.ID())

                var status ExitStatus
                if err != nil {
                        status = ExitStatus{code: UnknownExitStatus, err: err}
                } else {
                        status = ExitStatus{code: exit.ExitStatus, exitedAt: exit.ExitedAt}
                }
                statusC <- status
        }()

        return statusC, nil
}

// Stop forwards a stop request for this sandbox to the controller selected by
// the sandbox's Sandboxer and returns the controller's error, if any.
func (s *sandboxClient) Stop(ctx context.Context) error {
        controller := s.client.SandboxController(s.metadata.Sandboxer)
        return controller.Stop(ctx, s.ID())
}

// Shutdown asks the sandbox controller to shut the sandbox down and then
// deletes the sandbox record from the sandbox store.
//
// A NotFound error from either step is ignored, so shutting down a sandbox
// that is already gone succeeds. Any other error is wrapped and returned, and
// a controller failure prevents the store record from being deleted.
func (s *sandboxClient) Shutdown(ctx context.Context) error {
        controller := s.client.SandboxController(s.metadata.Sandboxer)
        // Only a genuine failure aborts here; NotFound means there is nothing
        // left to shut down.
        if err := controller.Shutdown(ctx, s.ID()); err != nil && !errdefs.IsNotFound(err) {
                return fmt.Errorf("failed to shutdown sandbox: %w", err)
        }

        if err := s.client.SandboxStore().Delete(ctx, s.ID()); err != nil && !errdefs.IsNotFound(err) {
                return fmt.Errorf("failed to delete sandbox from store: %w", err)
        }

        return nil
}

// NewSandbox builds a sandbox record for sandboxID, applies opts to it in
// order, creates it in the sandbox store and returns a Sandbox client backed
// by the stored metadata. An empty sandboxID is rejected, and the first
// failing option aborts the creation.
func (c *Client) NewSandbox(ctx context.Context, sandboxID string, opts ...NewSandboxOpts) (Sandbox, error) {
        if len(sandboxID) == 0 {
                return nil, errors.New("sandbox ID must be specified")
        }

        var pending api.Sandbox
        pending.ID = sandboxID
        pending.CreatedAt = time.Now().UTC()
        pending.UpdatedAt = time.Now().UTC()

        for i := range opts {
                if err := opts[i](ctx, c, &pending); err != nil {
                        return nil, err
                }
        }

        stored, err := c.SandboxStore().Create(ctx, pending)
        if err != nil {
                return nil, err
        }

        sb := &sandboxClient{client: c, metadata: stored}
        return sb, nil
}

// LoadSandbox loads the metadata of an existing sandbox from the sandbox
// store by id and returns a Sandbox client backed by it.
func (c *Client) LoadSandbox(ctx context.Context, id string) (Sandbox, error) {
        stored, err := c.SandboxStore().Get(ctx, id)
        if err != nil {
                return nil, err
        }

        sb := &sandboxClient{client: c, metadata: stored}
        return sb, nil
}

// NewSandboxOpts is an option applied by the client while creating a sandbox.
// Each option receives the sandbox record under construction and may modify
// it; returning an error aborts the creation.
type NewSandboxOpts func(ctx context.Context, client *Client, sandbox *api.Sandbox) error

// WithSandboxRuntime selects the runtime used to run a sandbox. The options
// value is marshalled with typeurl; when it is nil an empty message is
// marshalled in its place.
func WithSandboxRuntime(name string, options interface{}) NewSandboxOpts {
        return func(_ context.Context, _ *Client, sb *api.Sandbox) error {
                payload := options
                if payload == nil {
                        payload = &types.Empty{}
                }

                encoded, err := typeurl.MarshalAny(payload)
                if err != nil {
                        return fmt.Errorf("failed to marshal sandbox runtime options: %w", err)
                }

                sb.Runtime = api.RuntimeOpts{Name: name, Options: encoded}
                return nil
        }
}

// WithSandboxSpec sets the sandbox runtime spec. The given spec opts are
// applied to s first, then s is marshalled with typeurl and stored on the
// sandbox.
func WithSandboxSpec(s *oci.Spec, opts ...oci.SpecOpts) NewSandboxOpts {
        return func(ctx context.Context, client *Client, sandbox *api.Sandbox) error {
                // ApplyOpts takes a container; only the ID is filled in here.
                holder := containers.Container{ID: sandbox.ID}
                if err := oci.ApplyOpts(ctx, client, &holder, s, opts...); err != nil {
                        return err
                }

                encoded, err := typeurl.MarshalAny(s)
                if err != nil {
                        return fmt.Errorf("failed to marshal spec: %w", err)
                }
                sandbox.Spec = encoded

                return nil
        }
}

// WithSandboxExtension attaches an extension to the sandbox under name. The
// extension is marshalled with typeurl; the Extensions map is allocated on
// first use.
func WithSandboxExtension(name string, extension interface{}) NewSandboxOpts {
        return func(_ context.Context, _ *Client, sb *api.Sandbox) error {
                if sb.Extensions == nil {
                        sb.Extensions = map[string]typeurl.Any{}
                }

                encoded, err := typeurl.MarshalAny(extension)
                if err != nil {
                        return fmt.Errorf("failed to marshal sandbox extension: %w", err)
                }
                sb.Extensions[name] = encoded

                return nil
        }
}

// WithSandboxLabels sets the sandbox labels to the given map. The map is
// assigned as-is (not copied) and replaces any labels set earlier.
func WithSandboxLabels(labels map[string]string) NewSandboxOpts {
        assign := func(_ context.Context, _ *Client, sb *api.Sandbox) error {
                sb.Labels = labels
                return nil
        }
        return assign
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "fmt"

        containersapi "github.com/containerd/containerd/api/services/containers/v1"
        "github.com/containerd/containerd/api/services/diff/v1"
        imagesapi "github.com/containerd/containerd/api/services/images/v1"
        namespacesapi "github.com/containerd/containerd/api/services/namespaces/v1"
        "github.com/containerd/containerd/api/services/tasks/v1"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/introspection"
        "github.com/containerd/containerd/v2/core/leases"
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/plugins"
        srv "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/plugin"
)

// services collects the service implementations that can be supplied to a
// client. Fields are populated through ServicesOpt functions; a field that no
// option has set keeps its zero value.
type services struct {
        contentStore         content.Store
        imageStore           images.Store
        containerStore       containers.Store
        namespaceStore       namespaces.Store
        snapshotters         map[string]snapshots.Snapshotter
        taskService          tasks.TasksClient
        diffService          DiffService
        eventService         EventService
        leasesService        leases.Manager
        introspectionService introspection.Service
        sandboxStore         sandbox.Store
        sandboxers           map[string]sandbox.Controller
}

// ServicesOpt allows callers to set options on the services. Each option
// mutates the services value it is given.
type ServicesOpt func(c *services)

// WithContentStore returns a ServicesOpt that installs contentStore as the
// content store.
func WithContentStore(contentStore content.Store) ServicesOpt {
        return func(svc *services) {
                svc.contentStore = contentStore
        }
}

// WithImageClient returns a ServicesOpt that wraps the given images client in
// an image store (via NewImageStoreFromClient) and installs it. The wrapping
// happens when the option is applied.
func WithImageClient(imageService imagesapi.ImagesClient) ServicesOpt {
        return func(svc *services) {
                store := NewImageStoreFromClient(imageService)
                svc.imageStore = store
        }
}

// WithImageStore returns a ServicesOpt that installs imageStore as the image
// store.
func WithImageStore(imageStore images.Store) ServicesOpt {
        return func(svc *services) {
                svc.imageStore = imageStore
        }
}

// WithSnapshotters returns a ServicesOpt that installs a copy of the given
// snapshotter map, so later changes to the caller's map are not reflected in
// the services. The copy is made when the option is applied.
func WithSnapshotters(snapshotters map[string]snapshots.Snapshotter) ServicesOpt {
        return func(svc *services) {
                copied := make(map[string]snapshots.Snapshotter)
                for name := range snapshotters {
                        copied[name] = snapshotters[name]
                }
                svc.snapshotters = copied
        }
}

// WithContainerClient returns a ServicesOpt that wraps the given containers
// client in a container store (via NewRemoteContainerStore) and installs it.
// The wrapping happens when the option is applied.
func WithContainerClient(containerService containersapi.ContainersClient) ServicesOpt {
        return func(svc *services) {
                store := NewRemoteContainerStore(containerService)
                svc.containerStore = store
        }
}

// WithContainerStore returns a ServicesOpt that installs containerStore as
// the container store.
func WithContainerStore(containerStore containers.Store) ServicesOpt {
        return func(svc *services) {
                svc.containerStore = containerStore
        }
}

// WithTaskClient returns a ServicesOpt that installs the given tasks client
// as the task service.
func WithTaskClient(taskService tasks.TasksClient) ServicesOpt {
        return func(svc *services) {
                svc.taskService = taskService
        }
}

// WithDiffClient returns a ServicesOpt that wraps the given diff client in a
// DiffService (via NewDiffServiceFromClient) and installs it. The wrapping
// happens when the option is applied.
func WithDiffClient(diffService diff.DiffClient) ServicesOpt {
        return func(svc *services) {
                wrapped := NewDiffServiceFromClient(diffService)
                svc.diffService = wrapped
        }
}

// WithDiffService returns a ServicesOpt that installs diffService as the diff
// service.
func WithDiffService(diffService DiffService) ServicesOpt {
        return func(svc *services) {
                svc.diffService = diffService
        }
}

// WithEventService returns a ServicesOpt that installs eventService as the
// event service.
func WithEventService(eventService EventService) ServicesOpt {
        return func(svc *services) {
                svc.eventService = eventService
        }
}

// WithNamespaceClient returns a ServicesOpt that wraps the given namespaces
// client in a namespace store (via NewNamespaceStoreFromClient) and installs
// it. The wrapping happens when the option is applied.
func WithNamespaceClient(namespaceService namespacesapi.NamespacesClient) ServicesOpt {
        return func(svc *services) {
                store := NewNamespaceStoreFromClient(namespaceService)
                svc.namespaceStore = store
        }
}

// WithNamespaceService returns a ServicesOpt that installs namespaceService
// as the namespace store.
func WithNamespaceService(namespaceService namespaces.Store) ServicesOpt {
        return func(svc *services) {
                svc.namespaceStore = namespaceService
        }
}

// WithLeasesService returns a ServicesOpt that installs leasesService as the
// lease manager.
func WithLeasesService(leasesService leases.Manager) ServicesOpt {
        return func(svc *services) {
                svc.leasesService = leasesService
        }
}

// WithIntrospectionService returns a ServicesOpt that installs in as the
// introspection service.
func WithIntrospectionService(in introspection.Service) ServicesOpt {
        return func(svc *services) {
                svc.introspectionService = in
        }
}

// WithSandboxStore returns a ServicesOpt that installs the given store as the
// sandbox store.
func WithSandboxStore(client sandbox.Store) ServicesOpt {
        return func(svc *services) {
                svc.sandboxStore = client
        }
}

// WithInMemoryServices is suitable for cases when there is need to use containerd's client from
// another (in-memory) containerd plugin (such as CRI).
//
// The returned Opt looks up the event, lease and sandbox store plugins with
// ic.GetSingle and the individual services registered under the service plugin
// type with ic.GetByType. It converts each instance into a ServicesOpt and
// applies all of them to a freshly allocated services value, which replaces
// c.services. An error is returned if any lookup fails or a service is
// missing. The type assertions on plugin instances are unchecked.
func WithInMemoryServices(ic *plugin.InitContext) Opt {
        return func(c *clientOpts) error {
                var opts []ServicesOpt
                // Plugins looked up by type; each entry converts the plugin instance
                // into the matching ServicesOpt.
                for t, fn := range map[plugin.Type]func(interface{}) ServicesOpt{
                        plugins.EventPlugin: func(i interface{}) ServicesOpt {
                                return WithEventService(i.(EventService))
                        },
                        plugins.LeasePlugin: func(i interface{}) ServicesOpt {
                                return WithLeasesService(i.(leases.Manager))
                        },
                        plugins.SandboxStorePlugin: func(i interface{}) ServicesOpt {
                                return WithSandboxStore(i.(sandbox.Store))
                        },
                } {
                        i, err := ic.GetSingle(t)
                        if err != nil {
                                return fmt.Errorf("failed to get %q plugin: %w", t, err)
                        }
                        opts = append(opts, fn(i))
                }

                // From here on the local variable plugins shadows the imported
                // plugins package for the rest of this function.
                plugins, err := ic.GetByType(plugins.ServicePlugin)
                if err != nil {
                        return fmt.Errorf("failed to get service plugin: %w", err)
                }
                // Services looked up by name within the service plugins.
                for s, fn := range map[string]func(interface{}) ServicesOpt{
                        srv.ContentService: func(s interface{}) ServicesOpt {
                                return WithContentStore(s.(content.Store))
                        },
                        srv.ImagesService: func(s interface{}) ServicesOpt {
                                return WithImageClient(s.(imagesapi.ImagesClient))
                        },
                        srv.SnapshotsService: func(s interface{}) ServicesOpt {
                                return WithSnapshotters(s.(map[string]snapshots.Snapshotter))
                        },
                        srv.ContainersService: func(s interface{}) ServicesOpt {
                                return WithContainerClient(s.(containersapi.ContainersClient))
                        },
                        srv.TasksService: func(s interface{}) ServicesOpt {
                                return WithTaskClient(s.(tasks.TasksClient))
                        },
                        srv.DiffService: func(s interface{}) ServicesOpt {
                                return WithDiffClient(s.(diff.DiffClient))
                        },
                        srv.NamespacesService: func(s interface{}) ServicesOpt {
                                return WithNamespaceClient(s.(namespacesapi.NamespacesClient))
                        },
                        srv.IntrospectionService: func(s interface{}) ServicesOpt {
                                return WithIntrospectionService(s.(introspection.Service))
                        },
                } {
                        i := plugins[s]
                        if i == nil {
                                return fmt.Errorf("service %q not found", s)
                        }
                        opts = append(opts, fn(i))
                }

                // Replace any previously configured services with a fresh value and
                // apply the collected options to it.
                c.services = &services{}
                for _, o := range opts {
                        o(c.services)
                }
                return nil
        }
}

// WithInMemorySandboxControllers returns an Opt that collects every plugin of
// the sandbox controller type from ic and stores them, keyed by name, as the
// sandboxers of the client's services.
//
// The option writes through c.services, so the services must already be set
// when it is applied (WithInMemoryServices does that). The type assertion on
// each plugin instance is unchecked.
func WithInMemorySandboxControllers(ic *plugin.InitContext) Opt {
        return func(c *clientOpts) error {
                found, err := ic.GetByType(plugins.SandboxControllerPlugin)
                if err != nil {
                        return err
                }

                controllers := make(map[string]sandbox.Controller)
                for name := range found {
                        controllers[name] = found[name].(sandbox.Controller)
                }

                c.services.sandboxers = controllers
                return nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "encoding/json"
        "fmt"
        "syscall"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/moby/sys/signal"
        v1 "github.com/opencontainers/image-spec/specs-go/v1"
)

// StopSignalLabel is a well-known containerd label for storing the stop
// signal specified in the OCI image config. GetStopSignal reads this label
// from a container's labels.
const StopSignalLabel = "io.containerd.image.config.stop-signal"

// GetStopSignal returns the stop signal for container. If the container
// carries the StopSignalLabel label, its value is parsed and returned (along
// with any parse error); otherwise defaultSignal is returned. When the labels
// cannot be read, the result is -1 and the error.
func GetStopSignal(ctx context.Context, container Container, defaultSignal syscall.Signal) (syscall.Signal, error) {
        labels, err := container.Labels(ctx)
        if err != nil {
                return -1, err
        }

        raw, found := labels[StopSignalLabel]
        if !found {
                return defaultSignal, nil
        }
        return signal.ParseSignal(raw)
}

// GetOCIStopSignal returns the stop signal recorded in the OCI image config
// of image, or defaultSignal when the config does not specify one.
//
// defaultSignal must itself be a parseable signal; otherwise the parse error
// is returned before the image is inspected. An image config with an
// unrecognised media type is rejected.
func GetOCIStopSignal(ctx context.Context, image Image, defaultSignal string) (string, error) {
        if _, err := signal.ParseSignal(defaultSignal); err != nil {
                return "", err
        }

        desc, err := image.Config(ctx)
        if err != nil {
                return "", err
        }
        if !images.IsConfigType(desc.MediaType) {
                return "", fmt.Errorf("unknown image config media type %s", desc.MediaType)
        }

        blob, err := content.ReadBlob(ctx, image.ContentStore(), desc)
        if err != nil {
                return "", err
        }

        var parsed v1.Image
        if err := json.Unmarshal(blob, &parsed); err != nil {
                return "", err
        }

        if sig := parsed.Config.StopSignal; sig != "" {
                return sig, nil
        }
        return defaultSignal, nil
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "fmt"

        "github.com/containerd/containerd/v2/core/snapshots"
)

// Snapshotter capability names consulted by resolveSnapshotOptions.
const (
        // capaRemapIDs: when a snapshotter reports it, resolveSnapshotOptions
        // returns the parent unchanged and performs no remapping itself.
        capaRemapIDs = "remap-ids"
        // capaOnlyRemapIDs: when a snapshotter reports it and remapping is
        // needed, resolveSnapshotOptions returns an error instead of falling
        // back to remapping the root filesystem.
        capaOnlyRemapIDs = "only-remap-ids"
)

// WithRemapperLabels creates the labels used by any supporting snapshotter
// to shift the filesystem ownership (user namespace mapping) automatically; currently
// supported by the fuse-overlayfs and overlay snapshotters.
//
// Both the UID and the GID mapping label are formatted as
// "<container id>:<host id>:<length>", sharing the same length.
func WithRemapperLabels(ctrUID, hostUID, ctrGID, hostGID, length uint32) snapshots.Opt {
        uidMapping := fmt.Sprintf("%d:%d:%d", ctrUID, hostUID, length)
        gidMapping := fmt.Sprintf("%d:%d:%d", ctrGID, hostGID, length)

        labels := make(map[string]string, 2)
        labels[snapshots.LabelSnapshotUIDMapping] = uidMapping
        labels[snapshots.LabelSnapshotGIDMapping] = gidMapping

        return snapshots.WithLabels(labels)
}

// resolveSnapshotOptions returns the snapshot key to use as parent, taking
// into account any UID/GID mapping labels carried by opts.
//
// parent is returned unchanged when the snapshotter reports the "remap-ids"
// capability or when opts set neither mapping label. Otherwise the function
// looks for an existing snapshot keyed "<parent>-<hostUID>-<hostGID>"; if
// none can be stat'ed, it prepares a temporary "<key>-remap" snapshot on top
// of parent, runs remapRootFS on its mounts and commits it under that key.
//
// An error is returned when the snapshotter reports "only-remap-ids", when a
// mapping label cannot be parsed as "%d:%d:%d", when a container UID/GID is
// not 0, or when a snapshotter operation or remapRootFS fails.
func resolveSnapshotOptions(ctx context.Context, client *Client, snapshotterName string, snapshotter snapshots.Snapshotter, parent string, opts ...snapshots.Opt) (string, error) {
        capabs, err := client.GetSnapshotterCapabilities(ctx, snapshotterName)
        if err != nil {
                return "", err
        }

        for _, capab := range capabs {
                if capab == capaRemapIDs {
                        // Snapshotter supports ID remapping, we don't need to do anything.
                        return parent, nil
                }
        }

        // Apply the options to a scratch Info purely to read back the labels
        // they would set. Errors returned by the options are not checked.
        var local snapshots.Info
        for _, opt := range opts {
                opt(&local)
        }

        needsRemap := false
        var uidMap, gidMap string

        if value, ok := local.Labels[snapshots.LabelSnapshotUIDMapping]; ok {
                needsRemap = true
                uidMap = value
        }
        if value, ok := local.Labels[snapshots.LabelSnapshotGIDMapping]; ok {
                needsRemap = true
                gidMap = value
        }

        if !needsRemap {
                return parent, nil
        }

        capaOnlyRemap := false
        for _, capa := range capabs {
                if capa == capaOnlyRemapIDs {
                        capaOnlyRemap = true
                }
        }

        if capaOnlyRemap {
                return "", fmt.Errorf("snapshotter %q doesn't support idmap mounts on this host, configure `slow_chown` to allow a slower and expensive fallback", snapshotterName)
        }

        // Both mappings are parsed unconditionally, so a mapping that was not
        // supplied (empty string) is reported as unparsable.
        var ctrUID, hostUID, length uint32
        _, err = fmt.Sscanf(uidMap, "%d:%d:%d", &ctrUID, &hostUID, &length)
        if err != nil {
                return "", fmt.Errorf("uidMap unparsable: %w", err)
        }

        // lengthGID is parsed for validation only; it is not used afterwards.
        var ctrGID, hostGID, lengthGID uint32
        _, err = fmt.Sscanf(gidMap, "%d:%d:%d", &ctrGID, &hostGID, &lengthGID)
        if err != nil {
                return "", fmt.Errorf("gidMap unparsable: %w", err)
        }

        if ctrUID != 0 || ctrGID != 0 {
                return "", fmt.Errorf("Container UID/GID of 0 only supported currently (%d/%d)", ctrUID, ctrGID)
        }

        // TODO(dgl): length isn't taken into account for the intermediate snapshot id.
        usernsID := fmt.Sprintf("%s-%d-%d", parent, hostUID, hostGID)
        // Reuse a previously committed remapped snapshot when one can be stat'ed.
        // Any Stat error falls through to creating it.
        if _, err := snapshotter.Stat(ctx, usernsID); err == nil {
                return usernsID, nil
        }
        mounts, err := snapshotter.Prepare(ctx, usernsID+"-remap", parent)
        if err != nil {
                return "", err
        }
        // TODO(dgl): length isn't taken into account here yet either.
        if err := remapRootFS(ctx, mounts, hostUID, hostGID); err != nil {
                // Clean up the temporary snapshot; the Remove error is not checked.
                snapshotter.Remove(ctx, usernsID+"-remap")
                return "", err
        }
        // NOTE(review): the "-remap" snapshot is not removed when Commit fails.
        if err := snapshotter.Commit(ctx, usernsID, usernsID+"-remap"); err != nil {
                return "", err
        }

        return usernsID, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "errors"
        "fmt"
        "io"
        goruntime "runtime"
        "strings"
        "syscall"
        "time"

        "github.com/containerd/containerd/api/services/tasks/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/api/types/runc/options"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/pkg/cio"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        google_protobuf "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/pkg/rootfs"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/typeurl/v2"
        digest "github.com/opencontainers/go-digest"
        is "github.com/opencontainers/image-spec/specs-go"
        v1 "github.com/opencontainers/image-spec/specs-go/v1"
        specs "github.com/opencontainers/runtime-spec/specs-go"
)

// UnknownExitStatus is the exit code reported when containerd is unable to
// determine the exit status of a process. This can happen if the process never
// starts or if an error was encountered while obtaining the exit status.
// Its value is 255.
const UnknownExitStatus = 255

const (
        // checkpointDateFormat is the time layout used for the timestamp part
        // of a default checkpoint name.
        checkpointDateFormat = "01-02-2006-15:04:05"
        // checkpointNameFormat is the format of the default checkpoint image
        // name; its two verbs are filled with the task ID and the formatted
        // current time.
        checkpointNameFormat = "containerd.io/checkpoint/%s:%s"
)

// Status holds the process status and exit information of a process.
type Status struct {
        // Status is the current state of the process.
        Status ProcessStatus
        // ExitStatus is the exit code returned by the process.
        ExitStatus uint32
        // ExitTime is the time at which the process exited.
        ExitTime time.Time
}

// ProcessInfo provides platform specific process information.
type ProcessInfo struct {
        // Pid is the process ID.
        Pid uint32
        // Info includes additional process information.
        // Its content varies by platform.
        Info *google_protobuf.Any
}

// ProcessStatus is a human readable string describing the current status of a
// Process.
type ProcessStatus string

// Known ProcessStatus values.
const (
        // Running indicates the process is currently executing.
        Running ProcessStatus = "running"
        // Created indicates the process has been created within containerd but
        // the user's defined process has not started.
        Created ProcessStatus = "created"
        // Stopped indicates that the process has run and exited.
        Stopped ProcessStatus = "stopped"
        // Paused indicates that the process is currently paused.
        Paused ProcessStatus = "paused"
        // Pausing indicates that the process is currently switching from a
        // running state into a paused state.
        Pausing ProcessStatus = "pausing"
        // Unknown indicates that we could not determine the status from the
        // runtime.
        Unknown ProcessStatus = "unknown"
)

// IOCloseInfo describes which io pipes of a process should be closed.
type IOCloseInfo struct {
        // Stdin requests that the stdin of the process be closed.
        Stdin bool
}

// IOCloserOpts allows the caller to mark specific pipes as closed on a process
// by mutating the supplied IOCloseInfo.
type IOCloserOpts func(*IOCloseInfo)

// WithStdinCloser is an IOCloserOpts that requests closing the stdin of a
// process by setting Stdin on the given IOCloseInfo.
func WithStdinCloser(r *IOCloseInfo) {
        r.Stdin = true
}

// CheckpointTaskInfo allows specific checkpoint information to be set for the
// task.
type CheckpointTaskInfo struct {
        // Name is the name given to the checkpoint image; when left empty a
        // default name is generated by Checkpoint.
        Name string
        // ParentCheckpoint is the digest of a parent checkpoint
        ParentCheckpoint digest.Digest
        // Options hold runtime specific settings for checkpointing a task
        Options interface{}

        // runtime is the runtime name of the container being checkpointed; it
        // is exposed read-only through Runtime.
        runtime string
}

// Runtime returns the runtime name for the container.
func (i *CheckpointTaskInfo) Runtime() string {
        return i.runtime
}

// CheckpointTaskOpts allows the caller to set checkpoint options by mutating
// the supplied CheckpointTaskInfo; a non-nil error aborts the checkpoint.
type CheckpointTaskOpts func(*CheckpointTaskInfo) error

// TaskInfo holds the options used for task creation.
type TaskInfo struct {
        // Checkpoint is the Descriptor for an existing checkpoint that can be used
        // to restore a task's runtime and memory state
        Checkpoint *types.Descriptor
        // RootFS is a list of mounts to use as the task's root filesystem
        RootFS []mount.Mount
        // Options hold runtime specific settings for task creation
        Options interface{}
        // RuntimePath is an absolute path that can be used to overwrite the
        // path to a shim runtime binary.
        RuntimePath string

        // runtime is the runtime name for the container, and cannot be changed.
        runtime string
}

// Runtime returns the runtime name for the container.
func (i *TaskInfo) Runtime() string {
        return i.runtime
}

// Task is the executable object within containerd. It embeds Process and adds
// task-level operations.
type Task interface {
        Process

        // Pause suspends the execution of the task
        Pause(context.Context) error
        // Resume the execution of the task
        Resume(context.Context) error
        // Exec creates a new process inside the task
        Exec(context.Context, string, *specs.Process, cio.Creator) (Process, error)
        // Pids returns a list of system specific process ids inside the task
        Pids(context.Context) ([]ProcessInfo, error)
        // Checkpoint serializes the runtime and memory information of a task into an
        // OCI Index that can be pushed and pulled from a remote resource.
        //
        // Additional software like CRIU may be required to checkpoint and restore tasks.
        // NOTE: Checkpoint can also dump task information to a directory; in that case
        // an empty OCI Index will be returned.
        Checkpoint(context.Context, ...CheckpointTaskOpts) (Image, error)
        // Update modifies executing tasks with updated settings
        Update(context.Context, ...UpdateTaskOpts) error
        // LoadProcess loads a previously created exec'd process
        LoadProcess(context.Context, string, cio.Attach) (Process, error)
        // Metrics returns task metrics for runtime specific metrics
        //
        // The metric types are generic to containerd and change depending on the runtime
        // For the built in Linux runtime, github.com/containerd/cgroups.Metrics
        // are returned in protobuf format
        Metrics(context.Context) (*types.Metric, error)
        // Spec returns the current OCI specification for the task
        Spec(context.Context) (*oci.Spec, error)
}

// Compile-time check that *task satisfies the Task interface.
var _ = (Task)(&task{})

// task is the client-side implementation of Task.
type task struct {
        // client is used to reach the containerd services.
        client *Client
        // c is the container the task belongs to; Spec is delegated to it.
        c Container

        // io is the task's attached IO and may be nil.
        io cio.IO
        // id is sent as the ContainerID in task service requests.
        id string
        // pid is the process id; it is updated by Start from the response.
        pid uint32
}

// Spec returns the current OCI specification for the task by delegating to
// the task's container.
func (t *task) Spec(ctx context.Context) (*oci.Spec, error) {
        return t.c.Spec(ctx)
}

// ID returns the ID of the task.
func (t *task) ID() string {
        return t.id
}

// Pid returns the pid (process id) recorded for the task.
func (t *task) Pid() uint32 {
        return t.pid
}

// Start asks the task service to start the task and records the pid reported
// in the response. When the request fails, any attached IO is cancelled and
// closed before the translated error is returned.
func (t *task) Start(ctx context.Context) error {
        resp, err := t.client.TaskService().Start(ctx, &tasks.StartRequest{ContainerID: t.id})
        if err == nil {
                t.pid = resp.Pid
                return nil
        }
        // The task never started, so release the IO that was set up for it.
        if t.io != nil {
                t.io.Cancel()
                t.io.Close()
        }
        return errdefs.FromGRPC(err)
}

// Kill sends signal s through the task service. The supplied options fill in
// a KillInfo whose ExecID and All fields are forwarded with the request; the
// first option that fails aborts the call with its error.
func (t *task) Kill(ctx context.Context, s syscall.Signal, opts ...KillOpts) error {
        info := KillInfo{}
        for _, apply := range opts {
                if err := apply(ctx, &info); err != nil {
                        return err
                }
        }
        if _, err := t.client.TaskService().Kill(ctx, &tasks.KillRequest{
                Signal:      uint32(s),
                ContainerID: t.id,
                ExecID:      info.ExecID,
                All:         info.All,
        }); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Pause asks the task service to pause the task and returns the translated
// error, if any.
func (t *task) Pause(ctx context.Context) error {
        svc := t.client.TaskService()
        _, err := svc.Pause(ctx, &tasks.PauseTaskRequest{ContainerID: t.id})
        return errdefs.FromGRPC(err)
}

// Resume asks the task service to resume the task and returns the translated
// error, if any.
func (t *task) Resume(ctx context.Context) error {
        svc := t.client.TaskService()
        _, err := svc.Resume(ctx, &tasks.ResumeTaskRequest{ContainerID: t.id})
        return errdefs.FromGRPC(err)
}

// Status fetches the task from the task service and converts the returned
// process information into a Status. The status name is lower-cased so it
// lines up with the ProcessStatus constants.
func (t *task) Status(ctx context.Context) (Status, error) {
        resp, err := t.client.TaskService().Get(ctx, &tasks.GetRequest{ContainerID: t.id})
        if err != nil {
                return Status{}, errdefs.FromGRPC(err)
        }
        proc := resp.Process
        state := strings.ToLower(proc.Status.String())
        return Status{
                Status:     ProcessStatus(state),
                ExitStatus: proc.ExitStatus,
                ExitTime:   protobuf.FromTimestamp(proc.ExitedAt),
        }, nil
}

// Wait starts a goroutine that waits on the task through the task service and
// returns a channel on which exactly one ExitStatus is delivered before the
// channel is closed. If the wait request fails, the delivered status carries
// UnknownExitStatus and the error. The returned error is always nil.
func (t *task) Wait(ctx context.Context) (<-chan ExitStatus, error) {
        // Buffered so the goroutine can finish even if nobody receives.
        statusC := make(chan ExitStatus, 1)
        go func() {
                defer close(statusC)
                resp, err := t.client.TaskService().Wait(ctx, &tasks.WaitRequest{ContainerID: t.id})
                if err == nil {
                        statusC <- ExitStatus{
                                code:     resp.ExitStatus,
                                exitedAt: protobuf.FromTimestamp(resp.ExitedAt),
                        }
                        return
                }
                statusC <- ExitStatus{code: UnknownExitStatus, err: err}
        }()
        return statusC, nil
}

// Delete deletes the task and its runtime state.
// It returns the exit status of the task and any errors that were encountered
// during cleanup.
//
// Every option in opts is invoked with the task first; the first failing
// option aborts the deletion. Unless the task's status permits deletion (see
// the switch below), an error wrapping errdefs.ErrFailedPrecondition is
// returned.
func (t *task) Delete(ctx context.Context, opts ...ProcessDeleteOpts) (*ExitStatus, error) {
        for _, o := range opts {
                if err := o(ctx, t); err != nil {
                        return nil, err
                }
        }
        status, err := t.Status(ctx)
        // Only a not-found error aborts here; for any other Status error the
        // zero-valued status falls into the "" case below.
        if err != nil && errdefs.IsNotFound(err) {
                return nil, err
        }
        switch status.Status {
        case Stopped, Unknown, "":
                // deletable as is
        case Created:
                if t.client.runtime == plugins.RuntimePlugin.String()+".windows" {
                        // On windows Created is akin to Stopped
                        break
                }
                if t.pid == 0 {
                        // allow for deletion of created tasks with PID 0
                        // https://github.com/containerd/containerd/issues/7357
                        break
                }
                // a created task with a non-zero pid is rejected like any
                // other non-stopped task
                fallthrough
        default:
                return nil, fmt.Errorf("task must be stopped before deletion: %s: %w", status.Status, errdefs.ErrFailedPrecondition)
        }
        if t.io != nil {
                // io.Wait locks for restored tasks on Windows unless we call
                // io.Close first (https://github.com/containerd/containerd/issues/5621)
                // in other cases, preserve the contract and let IO finish before closing
                if t.client.runtime == plugins.RuntimePlugin.String()+".windows" {
                        t.io.Close()
                }
                // io.Cancel is used to cancel the io goroutine while it is in
                // fifo-opening state. It does not stop the pipes since these
                // should be closed on the shim's side, otherwise we might lose
                // data from the container!
                t.io.Cancel()
                t.io.Wait()
        }
        r, err := t.client.TaskService().Delete(ctx, &tasks.DeleteTaskRequest{
                ContainerID: t.id,
        })
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        // Only cleanup the IO after a successful Delete
        if t.io != nil {
                t.io.Close()
        }
        return &ExitStatus{code: r.ExitStatus, exitedAt: protobuf.FromTimestamp(r.ExitedAt)}, nil
}

// Exec creates a new process with the given exec id inside the task.
//
// The id must not be empty, otherwise an error wrapping
// errdefs.ErrInvalidArgument is returned. ioCreate is called with the id to
// set up the process IO, spec is marshalled into the request, and on success
// a process bound to this task and the created IO is returned.
func (t *task) Exec(ctx context.Context, id string, spec *specs.Process, ioCreate cio.Creator) (_ Process, err error) {
        if id == "" {
                return nil, fmt.Errorf("exec id must not be empty: %w", errdefs.ErrInvalidArgument)
        }
        i, err := ioCreate(id)
        if err != nil {
                return nil, err
        }
        // Release the IO whenever the function returns with an error (err is
        // the named result, so this sees the final returned error).
        defer func() {
                if err != nil && i != nil {
                        i.Cancel()
                        i.Close()
                }
        }()
        pSpec, err := protobuf.MarshalAnyToProto(spec)
        if err != nil {
                return nil, err
        }
        cfg := i.Config()
        request := &tasks.ExecProcessRequest{
                ContainerID: t.id,
                ExecID:      id,
                Terminal:    cfg.Terminal,
                Stdin:       cfg.Stdin,
                Stdout:      cfg.Stdout,
                Stderr:      cfg.Stderr,
                Spec:        pSpec,
        }
        if _, err := t.client.TaskService().Exec(ctx, request); err != nil {
                // NOTE(review): the deferred cleanup above also runs on this
                // path, so Cancel and Close appear to be invoked twice here;
                // confirm the cio.IO implementations tolerate that.
                i.Cancel()
                i.Wait()
                i.Close()
                return nil, errdefs.FromGRPC(err)
        }
        return &process{
                id:   id,
                task: t,
                io:   i,
        }, nil
}

// Pids lists the processes of the task through the task service and converts
// each entry of the response into a ProcessInfo.
func (t *task) Pids(ctx context.Context) ([]ProcessInfo, error) {
        resp, err := t.client.TaskService().ListPids(ctx, &tasks.ListPidsRequest{ContainerID: t.id})
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        var infos []ProcessInfo
        for _, proc := range resp.Processes {
                info := ProcessInfo{Pid: proc.Pid, Info: proc.Info}
                infos = append(infos, info)
        }
        return infos, nil
}

// CloseIO applies the given options to an IOCloseInfo and asks the task
// service to close the selected pipes of the task.
func (t *task) CloseIO(ctx context.Context, opts ...IOCloserOpts) error {
        var info IOCloseInfo
        for _, apply := range opts {
                apply(&info)
        }
        req := &tasks.CloseIORequest{
                ContainerID: t.id,
                Stdin:       info.Stdin,
        }
        _, err := t.client.TaskService().CloseIO(ctx, req)
        return errdefs.FromGRPC(err)
}

// IO returns the IO attached to the task; it may be nil.
func (t *task) IO() cio.IO {
        return t.io
}

// Resize asks the task service to resize the task's pty to w columns and h
// rows and returns the translated error, if any.
func (t *task) Resize(ctx context.Context, w, h uint32) error {
        req := &tasks.ResizePtyRequest{ContainerID: t.id, Width: w, Height: h}
        _, err := t.client.TaskService().ResizePty(ctx, req)
        return errdefs.FromGRPC(err)
}

// Checkpoint checkpoints the task and, unless a checkpoint image path was
// supplied through the options, records the result as an image built from an
// OCI index containing the checkpoint descriptors, the container image target
// and a diff of the container's snapshot.
//
// NOTE: Checkpoint can also dump task information to a directory; in that
// case an empty image is returned.
func (t *task) Checkpoint(ctx context.Context, opts ...CheckpointTaskOpts) (Image, error) {
        // hold a lease for the duration of the call; done releases it on return
        ctx, done, err := t.client.WithLease(ctx)
        if err != nil {
                return nil, err
        }
        defer done(ctx)
        cr, err := t.client.ContainerService().Get(ctx, t.id)
        if err != nil {
                return nil, err
        }

        request := &tasks.CheckpointTaskRequest{
                ContainerID: t.id,
        }
        // seed the info with the container's runtime so options can inspect it
        i := CheckpointTaskInfo{
                runtime: cr.Runtime.Name,
        }
        for _, o := range opts {
                if err := o(&i); err != nil {
                        return nil, err
                }
        }
        // set a default name
        if i.Name == "" {
                i.Name = fmt.Sprintf(checkpointNameFormat, t.id, time.Now().Format(checkpointDateFormat))
        }
        request.ParentCheckpoint = i.ParentCheckpoint.String()
        if i.Options != nil {
                o, err := protobuf.MarshalAnyToProto(i.Options)
                if err != nil {
                        return nil, err
                }
                request.Options = o
        }

        status, err := t.Status(ctx)
        if err != nil {
                return nil, err
        }

        if status.Status != Paused {
                // make sure we pause it and resume after all other filesystem operations are completed
                if err := t.Pause(ctx); err != nil {
                        return nil, err
                }
                // the error returned by Resume is not checked
                defer t.Resume(ctx)
        }

        index := v1.Index{
                Versioned: is.Versioned{
                        SchemaVersion: 2,
                },
                Annotations: make(map[string]string),
        }
        if err := t.checkpointTask(ctx, &index, request); err != nil {
                return nil, err
        }
        // if a checkpoint image path was passed, skip building the checkpoint
        // image and return an empty image
        if isCheckpointPathExist(cr.Runtime.Name, i.Options) {
                return NewImage(t.client, images.Image{}), nil
        }

        if cr.Image != "" {
                if err := t.checkpointImage(ctx, &index, cr.Image); err != nil {
                        return nil, err
                }
                index.Annotations["image.name"] = cr.Image
        }
        if cr.SnapshotKey != "" {
                if err := t.checkpointRWSnapshot(ctx, &index, cr.Snapshotter, cr.SnapshotKey); err != nil {
                        return nil, err
                }
        }
        desc, err := writeIndex(ctx, &index, t.client, t.id)
        if err != nil {
                return nil, err
        }
        // register the index as an image labelled as a checkpoint
        im := images.Image{
                Name:   i.Name,
                Target: desc,
                Labels: map[string]string{
                        "containerd.io/checkpoint": "true",
                },
        }
        if im, err = t.client.ImageService().Create(ctx, im); err != nil {
                return nil, err
        }
        return NewImage(t.client, im), nil
}

// UpdateTaskInfo holds the settings that can be changed on a task through
// Update.
type UpdateTaskInfo struct {
        // Resources updates a task's resource constraints
        Resources interface{}
        // Annotations allows arbitrary and/or experimental resource constraints for task update
        Annotations map[string]string
}

// UpdateTaskOpts allows a caller to update task settings by mutating the
// supplied UpdateTaskInfo; a non-nil error aborts the update.
type UpdateTaskOpts func(context.Context, *Client, *UpdateTaskInfo) error

// Update applies the given options to an UpdateTaskInfo and sends the result
// to the task service. Resources, when set, are marshalled into the request;
// annotations are forwarded when non-nil.
func (t *task) Update(ctx context.Context, opts ...UpdateTaskOpts) error {
        var info UpdateTaskInfo
        for _, apply := range opts {
                if err := apply(ctx, t.client, &info); err != nil {
                        return err
                }
        }

        req := &tasks.UpdateTaskRequest{ContainerID: t.id}
        if info.Resources != nil {
                marshalled, err := typeurl.MarshalAny(info.Resources)
                if err != nil {
                        return err
                }
                req.Resources = protobuf.FromAny(marshalled)
        }
        if info.Annotations != nil {
                req.Annotations = info.Annotations
        }

        _, err := t.client.TaskService().Update(ctx, req)
        return errdefs.FromGRPC(err)
}

// LoadProcess loads a previously created process of the task. When id is the
// task's own id and no IO attach function is given, the task itself is
// returned. Otherwise the process is looked up through the task service and,
// if ioAttach is non-nil, its existing IO is attached.
func (t *task) LoadProcess(ctx context.Context, id string, ioAttach cio.Attach) (Process, error) {
        if ioAttach == nil && id == t.id {
                return t, nil
        }
        resp, err := t.client.TaskService().Get(ctx, &tasks.GetRequest{
                ContainerID: t.id,
                ExecID:      id,
        })
        if err != nil {
                if err = errdefs.FromGRPC(err); errdefs.IsNotFound(err) {
                        return nil, fmt.Errorf("no running process found: %w", err)
                }
                return nil, err
        }

        var attached cio.IO
        if ioAttach != nil {
                attached, err = attachExistingIO(resp, ioAttach)
                if err != nil {
                        return nil, err
                }
        }
        return &process{id: id, task: t, io: attached}, nil
}

// Metrics requests the metrics for this task from the task service, filtering
// by the task id, and returns the first metric of the response.
//
// When the response carries no metrics, the task status is queried: a
// not-found error from that query is returned as is, otherwise a generic
// "no metrics received" error is returned.
func (t *task) Metrics(ctx context.Context) (*types.Metric, error) {
        response, err := t.client.TaskService().Metrics(ctx, &tasks.MetricsRequest{
                Filters: []string{
                        "id==" + t.id,
                },
        })
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        // Check the length rather than comparing against nil: an empty but
        // non-nil slice would otherwise reach the index below and panic.
        if len(response.Metrics) == 0 {
                if _, err := t.Status(ctx); err != nil && errdefs.IsNotFound(err) {
                        return nil, err
                }
                return nil, errors.New("no metrics received")
        }

        return response.Metrics[0], nil
}

// checkpointTask sends the checkpoint request to the task service and appends
// every descriptor of the response to index.Manifests, tagged with the
// platform (GOOS/GOARCH) this client was built for.
func (t *task) checkpointTask(ctx context.Context, index *v1.Index, request *tasks.CheckpointTaskRequest) error {
        resp, err := t.client.TaskService().Checkpoint(ctx, request)
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        // NOTE: resp.Descriptors can be an empty slice if the checkpoint image
        // is skipped; in that case nothing is added to the index.
        for _, desc := range resp.Descriptors {
                manifest := v1.Descriptor{
                        MediaType:   desc.MediaType,
                        Size:        desc.Size,
                        Digest:      digest.Digest(desc.Digest),
                        Annotations: desc.Annotations,
                        Platform: &v1.Platform{
                                OS:           goruntime.GOOS,
                                Architecture: goruntime.GOARCH,
                        },
                }
                index.Manifests = append(index.Manifests, manifest)
        }
        return nil
}

// checkpointRWSnapshot creates a diff for the snapshot identified by id using
// the named snapshotter and the client's diff service, tags it with the
// platform (GOOS/GOARCH) this client was built for and appends it to
// index.Manifests.
func (t *task) checkpointRWSnapshot(ctx context.Context, index *v1.Index, snapshotterName string, id string) error {
        ref := diff.WithReference(fmt.Sprintf("checkpoint-rw-%s", id))
        snapshotter := t.client.SnapshotService(snapshotterName)
        rwDesc, err := rootfs.CreateDiff(ctx, id, snapshotter, t.client.DiffService(), ref)
        if err != nil {
                return err
        }
        rwDesc.Platform = &v1.Platform{
                OS:           goruntime.GOOS,
                Architecture: goruntime.GOARCH,
        }
        index.Manifests = append(index.Manifests, rwDesc)
        return nil
}

// checkpointImage looks up the named image through the image service and
// appends its target descriptor to index.Manifests. An empty image name is
// rejected.
func (t *task) checkpointImage(ctx context.Context, index *v1.Index, image string) error {
        if len(image) == 0 {
                return fmt.Errorf("cannot checkpoint image with empty name")
        }
        img, err := t.client.ImageService().Get(ctx, image)
        if err == nil {
                index.Manifests = append(index.Manifests, img.Target)
        }
        return err
}

// writeContent copies r into a writer obtained from store under ref, commits
// it with the given options and returns a descriptor carrying mediaType, the
// writer's digest and the number of bytes copied. An already-exists error from
// the commit is not treated as a failure.
func writeContent(ctx context.Context, store content.Ingester, mediaType, ref string, r io.Reader, opts ...content.Opt) (d v1.Descriptor, err error) {
        w, err := store.Writer(ctx, content.WithRef(ref))
        if err != nil {
                return d, err
        }
        defer w.Close()

        n, err := io.Copy(w, r)
        if err != nil {
                return d, err
        }

        if err := w.Commit(ctx, n, "", opts...); err != nil && !errdefs.IsAlreadyExists(err) {
                return d, err
        }
        return v1.Descriptor{
                MediaType: mediaType,
                Digest:    w.Digest(),
                Size:      n,
        }, nil
}

// isCheckpointPathExist reports whether v holds checkpoint options with a
// non-empty image path. Only the runc v2 runtime is recognised for now; any
// other runtime yields false.
func isCheckpointPathExist(runtime string, v interface{}) bool {
        if v == nil || runtime != plugins.RuntimeRuncV2 {
                return false
        }
        checkpointOpts, ok := v.(*options.CheckpointOptions)
        return ok && checkpointOpts.ImagePath != ""
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "errors"
        "fmt"
        "syscall"

        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/api/types/runc/options"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/errdefs"
        "github.com/opencontainers/runtime-spec/specs-go"
)

// NewTaskOpts allows the caller to set options on a new task by mutating the
// supplied TaskInfo; a non-nil error reports that the option could not be
// applied.
type NewTaskOpts func(context.Context, *Client, *TaskInfo) error

// WithRootFS supplies the mounts to use as the task's root filesystem, which
// allows a task to be created without a snapshot being allocated to its
// container.
func WithRootFS(mounts []mount.Mount) NewTaskOpts {
        return func(_ context.Context, _ *Client, info *TaskInfo) error {
                info.RootFS = mounts
                return nil
        }
}

// WithRuntimePath forces the task service to use the given path to the
// runtime binary instead of resolving it from the runtime name.
func WithRuntimePath(absRuntimePath string) NewTaskOpts {
        return func(_ context.Context, _ *Client, ti *TaskInfo) error {
                ti.RuntimePath = absRuntimePath
                return nil
        }
}

// WithTaskAPIEndpoint lets the task service manage a task through the given
// endpoint. The endpoint is usually served inside a sandbox and can be
// obtained from the sandbox status. Runtime options are created when unset;
// options of any other type are rejected.
func WithTaskAPIEndpoint(address string, version uint32) NewTaskOpts {
        return func(_ context.Context, _ *Client, ti *TaskInfo) error {
                if ti.Options == nil {
                        ti.Options = &options.Options{}
                }
                if runtimeOpts, ok := ti.Options.(*options.Options); ok {
                        runtimeOpts.TaskApiAddress = address
                        runtimeOpts.TaskApiVersion = version
                        return nil
                }
                return errors.New("invalid runtime v2 options format")
        }
}

// WithTaskCheckpoint allows a task to be created with live runtime and memory
// data from a previous checkpoint. The index behind the image target is
// decoded and the first manifest with the containerd checkpoint media type is
// stored as the task's checkpoint descriptor. Additional software such as CRIU
// may be required to restore a task from a checkpoint.
func WithTaskCheckpoint(im Image) NewTaskOpts {
        return func(ctx context.Context, c *Client, info *TaskInfo) error {
                target := im.Target()
                indexID := target.Digest
                index, err := decodeIndex(ctx, c.ContentStore(), target)
                if err != nil {
                        return err
                }
                for _, manifest := range index.Manifests {
                        if manifest.MediaType != images.MediaTypeContainerd1Checkpoint {
                                continue
                        }
                        info.Checkpoint = &types.Descriptor{
                                MediaType:   manifest.MediaType,
                                Size:        manifest.Size,
                                Digest:      manifest.Digest.String(),
                                Annotations: manifest.Annotations,
                        }
                        return nil
                }
                return fmt.Errorf("checkpoint not found in index %s", indexID)
        }
}

// WithCheckpointName sets the image name under which the checkpoint is
// stored.
func WithCheckpointName(name string) CheckpointTaskOpts {
        return func(info *CheckpointTaskInfo) error {
                info.Name = name
                return nil
        }
}

// WithCheckpointImagePath sets the image path in the checkpoint options.
// Checkpoint options are created when unset; options of any other type are
// rejected.
func WithCheckpointImagePath(path string) CheckpointTaskOpts {
        return func(info *CheckpointTaskInfo) error {
                if info.Options == nil {
                        info.Options = &options.CheckpointOptions{}
                }
                if checkpointOpts, ok := info.Options.(*options.CheckpointOptions); ok {
                        checkpointOpts.ImagePath = path
                        return nil
                }
                return errors.New("invalid runtime v2 checkpoint options format")
        }
}

// WithRestoreImagePath sets the CRIU image path in the task creation options.
// Runtime options are created when unset; options of any other type are
// rejected.
func WithRestoreImagePath(path string) NewTaskOpts {
        return func(_ context.Context, _ *Client, info *TaskInfo) error {
                if info.Options == nil {
                        info.Options = &options.Options{}
                }
                if runtimeOpts, ok := info.Options.(*options.Options); ok {
                        runtimeOpts.CriuImagePath = path
                        return nil
                }
                return errors.New("invalid runtime v2 options format")
        }
}

// WithRestoreWorkPath sets the CRIU work path in the task creation options.
// Runtime options are created when unset; options of any other type are
// rejected.
func WithRestoreWorkPath(path string) NewTaskOpts {
        return func(_ context.Context, _ *Client, info *TaskInfo) error {
                if info.Options == nil {
                        info.Options = &options.Options{}
                }
                if runtimeOpts, ok := info.Options.(*options.Options); ok {
                        runtimeOpts.CriuWorkPath = path
                        return nil
                }
                return errors.New("invalid runtime v2 options format")
        }
}

// ProcessDeleteOpts allows the caller to set options for the deletion of a
// task. Each option receives the process being deleted; a non-nil error aborts
// the deletion.
type ProcessDeleteOpts func(context.Context, Process) error

// WithProcessKill is a ProcessDeleteOpts that forcefully kills a process with
// SIGKILL (for all processes of a task) and waits for it to exit before the
// deletion proceeds.
func WithProcessKill(ctx context.Context, p Process) error {
        // the derived context is cancelled on return, which also ends the wait
        ctx, cancel := context.WithCancel(ctx)
        defer cancel()
        // start waiting before sending the kill so the exit is observed on s;
        // the exit status itself is not inspected since we are forcefully
        // killing the process
        s, err := p.Wait(ctx)
        if err != nil {
                return err
        }
        if err := p.Kill(ctx, syscall.SIGKILL, WithKillAll); err != nil {
                // Kill might still return an IsNotFound error, even if it actually
                // killed the process.
                if errdefs.IsNotFound(err) {
                        select {
                        case <-ctx.Done():
                                return ctx.Err()
                        case <-s:
                                return nil
                        }
                }
                // a failed-precondition error from Kill is treated as success
                if errdefs.IsFailedPrecondition(err) {
                        return nil
                }
                return err
        }
        // wait for the process to fully stop before letting the rest of the deletion complete
        <-s
        return nil
}

// KillInfo contains information on how to process a Kill action.
type KillInfo struct {
        // All kills all processes inside the task.
        // It is only valid on tasks and ignored on processes.
        All bool
        // ExecID is the ID of a process to kill
        ExecID string
}

// KillOpts allows options to be set for the killing of a process by mutating
// the supplied KillInfo; a non-nil error aborts the kill.
type KillOpts func(context.Context, *KillInfo) error

// WithKillAll is a KillOpts that requests killing all processes of a task by
// setting All on the KillInfo. It never returns an error.
func WithKillAll(ctx context.Context, i *KillInfo) error {
        i.All = true
        return nil
}

// WithKillExecID specifies the exec ID of the process to kill.
func WithKillExecID(execID string) KillOpts {
        return func(_ context.Context, info *KillInfo) error {
                info.ExecID = execID
                return nil
        }
}

// WithResources sets the provided resources for task updates. Resources must
// be either a *specs.LinuxResources or a *specs.WindowsResources; any other
// type makes the returned option fail when it is applied.
func WithResources(resources interface{}) UpdateTaskOpts {
        return func(_ context.Context, _ *Client, info *UpdateTaskInfo) error {
                switch resources.(type) {
                case *specs.LinuxResources, *specs.WindowsResources:
                        info.Resources = resources
                        return nil
                default:
                        return errors.New("WithResources requires a *specs.LinuxResources or *specs.WindowsResources")
                }
        }
}

// WithAnnotations sets the provided annotations for task updates.
func WithAnnotations(annotations map[string]string) UpdateTaskOpts {
        return func(_ context.Context, _ *Client, info *UpdateTaskInfo) error {
                info.Annotations = annotations
                return nil
        }
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"
        "errors"

        "github.com/containerd/containerd/api/types/runc/options"
)

// WithNoNewKeyring causes tasks not to be created with a new keyring for secret storage.
// There is an upper limit on the number of keyrings in a linux system.
//
// The flag is recorded on the task's *options.Options; one is allocated when
// no options are set yet, and any other options type is rejected.
func WithNoNewKeyring(ctx context.Context, c *Client, ti *TaskInfo) error {
        switch opts := ti.Options.(type) {
        case nil:
                ti.Options = &options.Options{NoNewKeyring: true}
        case *options.Options:
                opts.NoNewKeyring = true
        default:
                return errors.New("invalid v2 shim create options format")
        }
        return nil
}

// WithNoPivotRoot instructs the runtime not to use pivot_root.
//
// The flag is recorded on the task's *options.Options; one is allocated when
// no options are set yet, and any other options type is rejected.
func WithNoPivotRoot(_ context.Context, _ *Client, ti *TaskInfo) error {
        switch opts := ti.Options.(type) {
        case nil:
                ti.Options = &options.Options{NoPivotRoot: true}
        case *options.Options:
                opts.NoPivotRoot = true
        default:
                return errors.New("invalid v2 shim create options format")
        }
        return nil
}

// WithShimCgroup returns a NewTaskOpts that sets the existing cgroup for the
// shim by storing path in the ShimCgroup field of the task's *options.Options.
// Options are allocated when unset; any other options type is rejected.
func WithShimCgroup(path string) NewTaskOpts {
        return func(_ context.Context, _ *Client, ti *TaskInfo) error {
                switch opts := ti.Options.(type) {
                case nil:
                        ti.Options = &options.Options{ShimCgroup: path}
                case *options.Options:
                        opts.ShimCgroup = path
                default:
                        return errors.New("invalid v2 shim create options format")
                }
                return nil
        }
}

// WithUIDOwner allows console I/O to work with the remapped UID in user namespace.
//
// The returned NewTaskOpts stores uid in the IoUid field of the task's
// *options.Options, allocating the options when unset and rejecting any other
// options type.
func WithUIDOwner(uid uint32) NewTaskOpts {
        return func(_ context.Context, _ *Client, ti *TaskInfo) error {
                switch opts := ti.Options.(type) {
                case nil:
                        ti.Options = &options.Options{IoUid: uid}
                case *options.Options:
                        opts.IoUid = uid
                default:
                        return errors.New("invalid v2 shim create options format")
                }
                return nil
        }
}

// WithGIDOwner allows console I/O to work with the remapped GID in user namespace.
//
// The returned NewTaskOpts stores gid in the IoGid field of the task's
// *options.Options, allocating the options when unset and rejecting any other
// options type.
func WithGIDOwner(gid uint32) NewTaskOpts {
        return func(_ context.Context, _ *Client, ti *TaskInfo) error {
                switch opts := ti.Options.(type) {
                case nil:
                        ti.Options = &options.Options{IoGid: gid}
                case *options.Options:
                        opts.IoGid = gid
                default:
                        return errors.New("invalid v2 shim create options format")
                }
                return nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package client

import (
        "context"

        "github.com/containerd/containerd/v2/core/streaming"
        streamproxy "github.com/containerd/containerd/v2/core/streaming/proxy"
        "github.com/containerd/containerd/v2/core/transfer"
        "github.com/containerd/containerd/v2/core/transfer/proxy"
)

// Transfer hands src, dest and opts to a transfer proxy built on the client's
// connection and stream creator.
//
// The call runs under the context returned by c.WithLease; the release
// function returned alongside it is invoked when Transfer returns, and its
// result is not inspected.
func (c *Client) Transfer(ctx context.Context, src interface{}, dest interface{}, opts ...transfer.Opt) error {
        leased, release, err := c.WithLease(ctx)
        if err != nil {
                return err
        }
        defer release(leased)

        transferrer := proxy.NewTransferrer(c.conn, c.streamCreator())
        return transferrer.Transfer(leased, src, dest, opts...)
}

// streamCreator returns a streaming.StreamCreator backed by the client's
// connection, built through the streaming proxy package.
func (c *Client) streamCreator() streaming.StreamCreator {
        return streamproxy.NewStreamCreator(c.conn)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package config defines the global configuration for containerd
//
// Version History
// 1: Deprecated and removed in containerd 2.0
// 2: Uses fully qualified plugin names
// 3: Added support for migration and warning on unknown fields
package config

import (
        "bytes"
        "context"
        "errors"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "reflect"
        "strings"

        "dario.cat/mergo"
        "github.com/pelletier/go-toml/v2"

        "github.com/containerd/containerd/v2/version"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
)

// migrations holds the migration functions for every prior containerd config
// version, indexed by the version being migrated from. A nil entry means no
// config-level rewrite is applied for that version; MigrateConfig still
// advances the version number past it.
var migrations = []func(context.Context, *Config) error{
        nil,       // Version 0 is not defined, treated as version 1
        v1Migrate, // Version 1 plugins renamed to URI for version 2
        nil,       // Version 2 has only plugin changes to version 3
}

// NOTE: Any new map fields added also need to be handled in mergeConfig.

// Config provides containerd configuration data for the server.
//
// Fields are decoded from TOML using the key named in each struct tag. A
// Config may be assembled from several files: LoadConfig merges the main file
// and every file listed in Imports into a single value via mergeConfig.
type Config struct {
        // Version of the config file
        Version int `toml:"version"`
        // Root is the path to a directory where containerd will store persistent data
        Root string `toml:"root"`
        // State is the path to a directory where containerd will store transient data
        State string `toml:"state"`
        // TempDir is the path to a directory where to place containerd temporary files
        TempDir string `toml:"temp"`
        // PluginDir is the directory for dynamic plugins to be stored
        //
        // Deprecated: Please use proxy or binary external plugins.
        PluginDir string `toml:"plugin_dir"`
        // GRPC configuration settings
        GRPC GRPCConfig `toml:"grpc"`
        // TTRPC configuration settings
        TTRPC TTRPCConfig `toml:"ttrpc"`
        // Debug and profiling settings
        Debug Debug `toml:"debug"`
        // Metrics and monitoring settings
        Metrics MetricsConfig `toml:"metrics"`
        // DisabledPlugins are IDs of plugins to disable. Disabled plugins won't be
        // initialized and started.
        // DisabledPlugins must use a fully qualified plugin URI.
        DisabledPlugins []string `toml:"disabled_plugins"`
        // RequiredPlugins are IDs of required plugins. Containerd exits if any
        // required plugin doesn't exist or fails to be initialized or started.
        // RequiredPlugins must use a fully qualified plugin URI.
        RequiredPlugins []string `toml:"required_plugins"`
        // Plugins provides plugin specific configuration for the initialization of a plugin.
        // Keys are plugin IDs; Decode looks a plugin's section up by its ID.
        Plugins map[string]interface{} `toml:"plugins"`
        // OOMScore adjust the containerd's oom score
        OOMScore int `toml:"oom_score"`
        // Cgroup specifies cgroup information for the containerd daemon process
        Cgroup CgroupConfig `toml:"cgroup"`
        // ProxyPlugins configures plugins which are communicated to over GRPC
        ProxyPlugins map[string]ProxyPlugin `toml:"proxy_plugins"`
        // Timeouts specified as a duration
        Timeouts map[string]string `toml:"timeouts"`
        // Imports are additional file path list to config files that can overwrite main config file fields.
        // Relative entries are resolved against the directory of the importing file
        // and entries containing "*" are expanded as glob patterns (see resolveImports).
        Imports []string `toml:"imports"`
        // StreamProcessors configuration
        StreamProcessors map[string]StreamProcessor `toml:"stream_processors"`
}

// StreamProcessor provides configuration for diff content processors.
// Each entry describes an external binary together with the media types it
// consumes and the media type it produces.
type StreamProcessor struct {
        // Accepts specific media-types
        Accepts []string `toml:"accepts"`
        // Returns the media-type
        Returns string `toml:"returns"`
        // Path or name of the binary
        Path string `toml:"path"`
        // Args to the binary
        Args []string `toml:"args"`
        // Environment variables for the binary
        Env []string `toml:"env"`
}

// ValidateVersion checks that the config version is not newer than
// version.ConfigVersion and that every entry of DisabledPlugins and
// RequiredPlugins contains at least one "." (i.e. looks like a fully
// qualified plugin URI). The first violation found is returned as an error.
func (c *Config) ValidateVersion() error {
        if latest := version.ConfigVersion; c.Version > latest {
                return fmt.Errorf("expected containerd config version equal to or less than `%d`, got `%d`", latest, c.Version)
        }

        for _, uri := range c.DisabledPlugins {
                if strings.Contains(uri, ".") {
                        continue
                }
                return fmt.Errorf("invalid disabled plugin URI %q expect io.containerd.x.vx", uri)
        }
        for _, uri := range c.RequiredPlugins {
                if strings.Contains(uri, ".") {
                        continue
                }
                return fmt.Errorf("invalid required plugin URI %q expect io.containerd.x.vx", uri)
        }

        return nil
}

// MigrateConfig will convert the config to the latest version before using.
//
// Starting at c.Version, it runs the migration registered for each version in
// the migrations table (nil entries are skipped) and increments c.Version
// until it reaches version.ConfigVersion. The first migration error aborts
// the process and is returned; c.Version then still names the version whose
// migration failed.
//
// A negative version is rejected with an error wrapping
// errdefs.ErrInvalidArgument: ValidateVersion only bounds the version from
// above, so without this check a negative value would index the migrations
// table out of range and panic.
func (c *Config) MigrateConfig(ctx context.Context) error {
        if c.Version < 0 {
                return fmt.Errorf("invalid containerd config version `%d`: %w", c.Version, errdefs.ErrInvalidArgument)
        }
        for c.Version < version.ConfigVersion {
                if m := migrations[c.Version]; m != nil {
                        if err := m(ctx, c); err != nil {
                                return err
                        }
                }
                c.Version++
        }
        return nil
}

// v1Migrate rewrites the keys of c.Plugins from version 1 short names to
// fully qualified plugin URIs. Keys that already contain a "." are kept
// unchanged. Short names are resolved, in order, through the table of known
// core plugins, the "-service" suffix rule, the windows snapshotter rule and
// finally a fallback to the grpc plugin type; the last two are guesses and
// are reported with a warning. The function always returns nil.
func v1Migrate(ctx context.Context, c *Config) error {
        // legacyNames is the list of used plugins before v1 was deprecated
        legacyNames := map[string]string{
                "cri":       "io.containerd.grpc.v1.cri",
                "cgroups":   "io.containerd.monitor.v1.cgroups",
                "linux":     "io.containerd.runtime.v1.linux",
                "scheduler": "io.containerd.gc.v1.scheduler",
                "bolt":      "io.containerd.metadata.v1.bolt",
                "task":      "io.containerd.runtime.v2.task",
                "opt":       "io.containerd.internal.v1.opt",
                "restart":   "io.containerd.internal.v1.restart",
                "tracing":   "io.containerd.internal.v1.tracing",
                "otlp":      "io.containerd.tracing.processor.v1.otlp",
                "aufs":      "io.containerd.snapshotter.v1.aufs",
                "btrfs":     "io.containerd.snapshotter.v1.btrfs",
                "devmapper": "io.containerd.snapshotter.v1.devmapper",
                "native":    "io.containerd.snapshotter.v1.native",
                "overlayfs": "io.containerd.snapshotter.v1.overlayfs",
                "zfs":       "io.containerd.snapshotter.v1.zfs",
        }

        migrated := make(map[string]interface{}, len(c.Plugins))
        for name, section := range c.Plugins {
                uri := name
                if !strings.Contains(name, ".") {
                        // ambiguous holds the short name whenever the URI had to be guessed
                        var ambiguous string
                        full, known := legacyNames[name]
                        switch {
                        case known:
                                uri = full
                        case strings.HasSuffix(name, "-service"):
                                uri = "io.containerd.service.v1." + name
                        case name == "windows", name == "windows-lcow":
                                // runtime, differ, and snapshotter plugins do not have configs for v1
                                ambiguous = name
                                uri = "io.containerd.snapshotter.v1." + name
                        default:
                                ambiguous = name
                                uri = "io.containerd.grpc.v1." + name
                        }
                        if ambiguous != "" {
                                log.G(ctx).Warnf("Ambiguous %s plugin in v1 config, treating as %s", ambiguous, uri)
                        }
                }
                migrated[uri] = section
        }
        c.Plugins = migrated
        return nil
}

// GRPCConfig provides GRPC configuration for the socket.
//
// Every field is decoded from the TOML key named in its struct tag: the
// socket address, an optional TCP address with its TLS CA/cert/key settings,
// the UID and GID values, and the maximum receive and send message sizes.
// NOTE(review): the TLS fields are presumably file paths and UID/GID
// presumably the socket owner; confirm against the code that creates the
// listeners.
type GRPCConfig struct {
        Address        string `toml:"address"`
        TCPAddress     string `toml:"tcp_address"`
        TCPTLSCA       string `toml:"tcp_tls_ca"`
        TCPTLSCert     string `toml:"tcp_tls_cert"`
        TCPTLSKey      string `toml:"tcp_tls_key"`
        UID            int    `toml:"uid"`
        GID            int    `toml:"gid"`
        MaxRecvMsgSize int    `toml:"max_recv_message_size"`
        MaxSendMsgSize int    `toml:"max_send_message_size"`
}

// TTRPCConfig provides TTRPC configuration for the socket.
//
// Address, UID and GID are decoded from the TOML keys "address", "uid" and
// "gid" of the ttrpc section.
// NOTE(review): UID/GID presumably set the socket owner; confirm against the
// code that creates the listener.
type TTRPCConfig struct {
        Address string `toml:"address"`
        UID     int    `toml:"uid"`
        GID     int    `toml:"gid"`
}

// Debug provides debug configuration.
//
// Address, UID, GID and Level are decoded from the TOML keys "address",
// "uid", "gid" and "level" of the debug section.
// NOTE(review): Level is presumably the log level name; confirm against the
// code that applies it.
type Debug struct {
        Address string `toml:"address"`
        UID     int    `toml:"uid"`
        GID     int    `toml:"gid"`
        Level   string `toml:"level"`
        // Format represents the logging format. Supported values are 'text' and 'json'.
        Format string `toml:"format"`
}

// MetricsConfig provides metrics configuration.
//
// Address is decoded from the TOML key "address". GRPCHistogram (TOML key
// "grpc_histogram") enables grpc time histograms to measure rpc latencies
// when the server is created.
type MetricsConfig struct {
        Address       string `toml:"address"`
        GRPCHistogram bool   `toml:"grpc_histogram"`
}

// CgroupConfig provides cgroup configuration.
//
// Path is decoded from the TOML key "path" of the cgroup section.
type CgroupConfig struct {
        Path string `toml:"path"`
}

// ProxyPlugin provides a proxy plugin configuration.
//
// Type, Address, Platform and Exports are decoded from the TOML keys "type",
// "address", "platform" and "exports" of one proxy_plugins entry.
// NOTE(review): Address is presumably the endpoint the proxied plugin is
// reached at; confirm against the code that dials proxy plugins.
type ProxyPlugin struct {
        Type     string            `toml:"type"`
        Address  string            `toml:"address"`
        Platform string            `toml:"platform"`
        Exports  map[string]string `toml:"exports"`
}

// Decode unmarshals the plugin specific configuration stored under id into
// config and returns config.
//
// When c.Plugins has no entry for id, config is returned untouched. Otherwise
// the entry is re-encoded to TOML and decoded strictly; if the only problem
// is unknown keys, each one is logged as a warning and the data is decoded
// again leniently. Any other failure yields a nil result and the error.
func (c *Config) Decode(ctx context.Context, id string, config interface{}) (interface{}, error) {
        section, found := c.Plugins[id]
        if !found {
                return config, nil
        }

        encoded, err := toml.Marshal(section)
        if err != nil {
                return nil, err
        }

        strict := toml.NewDecoder(bytes.NewReader(encoded)).DisallowUnknownFields()
        err = strict.Decode(config)
        if err == nil {
                return config, nil
        }

        var missing *toml.StrictMissingError
        if errors.As(err, &missing) {
                for _, unknown := range missing.Errors {
                        log.G(ctx).WithFields(log.Fields{
                                "plugin": id,
                                "key":    strings.Join(unknown.Key(), " "),
                        }).WithError(err).Warn("Ignoring unknown key in TOML for plugin")
                }
                // Unknown keys are tolerated: retry without strict mode
                err = toml.Unmarshal(encoded, config)
        }
        if err != nil {
                return nil, err
        }

        return config, nil
}

// LoadConfig loads the containerd server config from the provided path and
// merges it, together with every file reachable through its imports, into
// out.
//
// Files are processed first-in first-out starting with path; a file that was
// already loaded is skipped so circular imports terminate. Once all files are
// merged the resulting config version is validated. Note that the path named
// in a validation error is the last file taken from the queue, not
// necessarily the root file.
func LoadConfig(ctx context.Context, path string, out *Config) error {
        if out == nil {
                return fmt.Errorf("argument out must not be nil: %w", errdefs.ErrInvalidArgument)
        }

        seen := map[string]struct{}{}
        queue := []string{path}

        for len(queue) > 0 {
                path, queue = queue[0], queue[1:]

                // Skip files that were already merged to prevent circular imports
                if _, dup := seen[path]; dup {
                        continue
                }

                fileConfig, err := loadConfigFile(ctx, path)
                if err != nil {
                        return err
                }

                if err = mergeConfig(out, fileConfig); err != nil {
                        return err
                }

                nested, err := resolveImports(path, fileConfig.Imports)
                if err != nil {
                        return err
                }

                seen[path] = struct{}{}
                queue = append(queue, nested...)
        }

        if err := out.ValidateVersion(); err != nil {
                return fmt.Errorf("failed to load TOML from %s: %w", path, err)
        }
        return nil
}

// loadConfigFile decodes the TOML file at the given path into a new Config.
//
// The file is first decoded strictly. If the only problem is unknown keys,
// each one is logged as a warning with its position and the file is rewound
// and decoded again leniently into a fresh Config. Remaining decode errors
// are logged with their position when available and returned wrapped.
func loadConfigFile(ctx context.Context, path string) (*Config, error) {
        f, err := os.Open(path)
        if err != nil {
                return nil, err
        }
        defer f.Close()

        config := &Config{}
        err = toml.NewDecoder(f).DisallowUnknownFields().Decode(config)
        if err == nil {
                return config, nil
        }

        var strictErr *toml.StrictMissingError
        if errors.As(err, &strictErr) {
                for _, unknown := range strictErr.Errors {
                        row, col := unknown.Position()
                        log.G(ctx).WithFields(log.Fields{
                                "file":   path,
                                "row":    row,
                                "column": col,
                                "key":    strings.Join(unknown.Key(), " "),
                        }).WithError(err).Warn("Ignoring unknown key in TOML")
                }

                // Start over from the beginning of the file, this time accepting unknown fields
                config = &Config{}
                if _, seekerr := f.Seek(0, io.SeekStart); seekerr != nil {
                        return nil, fmt.Errorf("unable to seek file to start %w: failed to unmarshal TOML with unknown fields: %w", seekerr, err)
                }
                err = toml.NewDecoder(f).Decode(config)
        }
        if err == nil {
                return config, nil
        }

        var decodeErr *toml.DecodeError
        if !errors.As(err, &decodeErr) {
                return nil, fmt.Errorf("failed to unmarshal TOML: %w", err)
        }

        row, column := decodeErr.Position()
        log.G(ctx).WithFields(log.Fields{
                "file":   path,
                "row":    row,
                "column": column,
        }).WithError(err).Error("Failure unmarshaling TOML")
        return nil, fmt.Errorf("failed to unmarshal TOML at row %d column %d: %w", row, column, err)
}

// resolveImports turns the import entries of a config file into the list of
// paths to load:
// - every entry is cleaned first
// - a non-absolute entry is taken relative to the directory of parent
// - an entry containing "*" is expanded with glob matching and contributes
//   all of its matches (possibly none)
// - any other entry is returned as is
//
// A malformed glob pattern aborts the resolution with its error.
func resolveImports(parent string, imports []string) ([]string, error) {
        var resolved []string

        base := filepath.Dir(parent)
        for _, entry := range imports {
                candidate := filepath.Clean(entry)
                if !filepath.IsAbs(candidate) {
                        candidate = filepath.Join(base, candidate)
                }

                if !strings.Contains(candidate, "*") {
                        resolved = append(resolved, candidate)
                        continue
                }

                matches, err := filepath.Glob(candidate)
                if err != nil {
                        return nil, err
                }
                resolved = append(resolved, matches...)
        }

        return resolved, nil
}

// mergeConfig merges 'from' into 'to' with the following rules:
// 'to'         'from'      'result'
// ""           "value"     "value"
// "value"      ""          "value"
// 1            0           1
// 0            1           1
// []{"1"}      []{"2"}     []{"1","2"}
// []{"1"}      []{}        []{"1"}
// []{"1", "2"} []{"1"}     []{"1","2"}
// []{}         []{"2"}     []{"2"}
// Maps merged by keys, but values are replaced entirely.
func mergeConfig(to, from *Config) error {
        if err := mergo.Merge(to, from, mergo.WithOverride, mergo.WithTransformers(sliceTransformer{})); err != nil {
                return err
        }

        // After the generic merge, overwrite each map entry wholesale so that a
        // section coming from 'from' replaces the one in 'to' instead of being
        // merged value by value.
        for id, section := range from.Plugins {
                to.Plugins[id] = section
        }

        for id, processor := range from.StreamProcessors {
                to.StreamProcessors[id] = processor
        }

        for id, proxied := range from.ProxyPlugins {
                to.ProxyPlugins[id] = proxied
        }

        for name, duration := range from.Timeouts {
                to.Timeouts[name] = duration
        }

        return nil
}

// sliceTransformer supplies the merge function used for slice-typed values:
// elements of the source slice are appended to the destination unless an
// equal element is already present.
type sliceTransformer struct{}

// Transformer returns nil for any type that is not a slice. For slice types
// it returns a function that appends to dst every element of src that dst
// does not already contain. The function does nothing when dst is not
// settable and fails when src and dst have different types.
func (sliceTransformer) Transformer(t reflect.Type) func(dst, src reflect.Value) error {
        if t.Kind() != reflect.Slice {
                return nil
        }
        return func(dst, src reflect.Value) error {
                if !dst.CanSet() {
                        return nil
                }
                if src.Type() != dst.Type() {
                        return fmt.Errorf("cannot append two slice with different type (%s, %s)", src.Type(), dst.Type())
                }

                // same compares two elements, using DeepEqual when both values can be
                // extracted and reflect's own equality otherwise.
                same := func(a, b reflect.Value) bool {
                        if a.CanInterface() && b.CanInterface() {
                                return reflect.DeepEqual(a.Interface(), b.Interface())
                        }
                        return a.Equal(b)
                }

                for i := 0; i < src.Len(); i++ {
                        elem := src.Index(i)
                        present := false
                        for j := 0; j < dst.Len() && !present; j++ {
                                present = same(elem, dst.Index(j))
                        }
                        if !present {
                                dst.Set(reflect.Append(dst, elem))
                        }
                }

                return nil
        }
}

// V2DisabledFilter returns a plugin.DisableFilter that reports true for every
// registration whose URI appears in list.
func V2DisabledFilter(list []string) plugin.DisableFilter {
        disabled := make(map[string]struct{}, len(list))
        for i := range list {
                disabled[list[i]] = struct{}{}
        }
        return func(r *plugin.Registration) bool {
                if _, hit := disabled[r.URI()]; hit {
                        return true
                }
                return false
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"

        "github.com/containerd/containerd/v2/pkg/namespaces"
        "google.golang.org/grpc"
)

// unaryNamespaceInterceptor is a unary server interceptor that, when the
// request context carries a namespace, re-applies it with
// namespaces.WithNamespace before invoking the handler. Requests without a
// namespace are passed through with their context unchanged.
func unaryNamespaceInterceptor(ctx context.Context, req interface{}, _ *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) {
        ns, ok := namespaces.Namespace(ctx)
        if !ok {
                return handler(ctx, req)
        }
        // The lookup above checks the *incoming* metadata, this makes sure the outgoing metadata is also set
        return handler(namespaces.WithNamespace(ctx, ns), req)
}

// streamNamespaceInterceptor is the streaming counterpart of
// unaryNamespaceInterceptor: when the stream context carries a namespace, the
// handler receives a wrapped stream whose Context has the namespace
// re-applied with namespaces.WithNamespace. Streams without a namespace are
// passed through unchanged.
func streamNamespaceInterceptor(srv interface{}, ss grpc.ServerStream, _ *grpc.StreamServerInfo, handler grpc.StreamHandler) error {
        ctx := ss.Context()
        ns, ok := namespaces.Namespace(ctx)
        if !ok {
                return handler(srv, ss)
        }

        // The lookup above checks the *incoming* metadata, this makes sure the outgoing metadata is also set
        wrapped := &wrappedSSWithContext{
                ServerStream: ss,
                ctx:          namespaces.WithNamespace(ctx, ns),
        }
        return handler(srv, wrapped)
}

// wrappedSSWithContext is a grpc.ServerStream whose Context method returns a
// caller-supplied context instead of the embedded stream's own; all other
// methods are those of the embedded ServerStream.
type wrappedSSWithContext struct {
        grpc.ServerStream
        ctx context.Context
}

// Context returns the context stored in the wrapper.
func (w *wrappedSSWithContext) Context() context.Context {
        return w.ctx
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "crypto/tls"
        "crypto/x509"
        "errors"
        "expvar"
        "fmt"
        "io"
        "net"
        "net/http"
        "net/http/pprof"
        "os"
        "path/filepath"
        "runtime"
        "sync"
        "sync/atomic"
        "time"

        "github.com/containerd/log"
        "github.com/containerd/ttrpc"
        "github.com/docker/go-metrics"
        grpc_prometheus "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus"
        v1 "github.com/opencontainers/image-spec/specs-go/v1"
        "github.com/prometheus/client_golang/prometheus"
        "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
        "google.golang.org/grpc"
        "google.golang.org/grpc/backoff"
        "google.golang.org/grpc/credentials"
        "google.golang.org/grpc/credentials/insecure"

        diffapi "github.com/containerd/containerd/api/services/diff/v1"
        sbapi "github.com/containerd/containerd/api/services/sandbox/v1"
        ssapi "github.com/containerd/containerd/api/services/snapshots/v1"
        srvconfig "github.com/containerd/containerd/v2/cmd/containerd/server/config"
        csproxy "github.com/containerd/containerd/v2/core/content/proxy"
        "github.com/containerd/containerd/v2/core/diff"
        diffproxy "github.com/containerd/containerd/v2/core/diff/proxy"
        sbproxy "github.com/containerd/containerd/v2/core/sandbox/proxy"
        ssproxy "github.com/containerd/containerd/v2/core/snapshots/proxy"
        "github.com/containerd/containerd/v2/defaults"
        "github.com/containerd/containerd/v2/pkg/deprecation"
        "github.com/containerd/containerd/v2/pkg/dialer"
        "github.com/containerd/containerd/v2/pkg/sys"
        "github.com/containerd/containerd/v2/pkg/timeout"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/content/local"
        "github.com/containerd/containerd/v2/plugins/services/warning"
        "github.com/containerd/containerd/v2/version"
        "github.com/containerd/platforms"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/dynamic"
        "github.com/containerd/plugin/registry"
)

// CreateTopLevelDirectories creates the top-level root and state directories
// and, when configured, the temp directory.
//
// Root and State must both be set and must differ. When TempDir is set it is
// created as well and exported to the process environment (TEMP and TMP on
// Windows, TMPDIR elsewhere). The first directory creation failure is
// returned.
func CreateTopLevelDirectories(config *srvconfig.Config) error {
        if config.Root == "" {
                return errors.New("root must be specified")
        }
        if config.State == "" {
                return errors.New("state must be specified")
        }
        if config.Root == config.State {
                return errors.New("root and state must be different paths")
        }

        for _, dir := range []string{config.Root, config.State} {
                if err := sys.MkdirAllWithACL(dir, 0o711); err != nil {
                        return err
                }
        }

        if config.TempDir == "" {
                return nil
        }
        if err := sys.MkdirAllWithACL(config.TempDir, 0o711); err != nil {
                return err
        }

        tempVars := []string{"TMPDIR"}
        if runtime.GOOS == "windows" {
                // On Windows, the Host Compute Service (vmcompute) will read the
                // TEMP/TMP setting from the calling process when creating the
                // tempdir to extract an image layer to. This allows the
                // administrator to align the tempdir location with the same volume
                // as the snapshot dir to avoid a copy operation when moving the
                // extracted layer to the snapshot dir location.
                tempVars = []string{"TEMP", "TMP"}
        }
        for _, name := range tempVars {
                os.Setenv(name, config.TempDir)
        }
        return nil
}

// New creates and initializes a new containerd server.
//
// It migrates config to the latest configuration version when needed, applies
// the process-level settings, registers the configured timeouts and stream
// processors, builds the gRPC, TCP and TTRPC servers, and then initializes the
// plugins in the order returned by LoadPlugins, registering the services they
// expose once every plugin has been initialized.
//
// An error is returned when migration, process setup, TLS setup or service
// registration fails, when a timeout value cannot be parsed, when the
// configured client CA file contains no usable certificate, or when a required
// plugin (or a plugin that registered readiness) fails to load.
func New(ctx context.Context, config *srvconfig.Config) (*Server, error) {
        var (
                currentVersion = config.Version
                migrationT     time.Duration
        )
        if currentVersion < version.ConfigVersion {
                // Migrate config to latest version
                t1 := time.Now()
                err := config.MigrateConfig(ctx)
                if err != nil {
                        return nil, err
                }
                migrationT = time.Since(t1)
        }

        if err := apply(ctx, config); err != nil {
                return nil, err
        }
        for key, sec := range config.Timeouts {
                d, err := time.ParseDuration(sec)
                if err != nil {
                        // Keep the parse error in the chain so the caller can see why
                        // the value was rejected.
                        return nil, fmt.Errorf("unable to parse %s into a time duration: %w", sec, err)
                }
                timeout.Set(key, d)
        }
        loaded, err := LoadPlugins(ctx, config)
        if err != nil {
                return nil, err
        }
        for id, p := range config.StreamProcessors {
                diff.RegisterProcessor(diff.BinaryHandler(id, p.Returns, p.Accepts, p.Path, p.Args, p.Env))
        }

        var prometheusServerMetricsOpts []grpc_prometheus.ServerMetricsOption
        if config.Metrics.GRPCHistogram {
                // Enable grpc time histograms to measure rpc latencies
                prometheusServerMetricsOpts = append(prometheusServerMetricsOpts, grpc_prometheus.WithServerHandlingTimeHistogram())
        }

        prometheusServerMetrics := grpc_prometheus.NewServerMetrics(prometheusServerMetricsOpts...)
        prometheus.MustRegister(prometheusServerMetrics)

        serverOpts := []grpc.ServerOption{
                grpc.StatsHandler(otelgrpc.NewServerHandler()),
                grpc.ChainStreamInterceptor(
                        streamNamespaceInterceptor,
                        prometheusServerMetrics.StreamServerInterceptor(),
                ),
                grpc.ChainUnaryInterceptor(
                        unaryNamespaceInterceptor,
                        prometheusServerMetrics.UnaryServerInterceptor(),
                ),
        }
        if config.GRPC.MaxRecvMsgSize > 0 {
                serverOpts = append(serverOpts, grpc.MaxRecvMsgSize(config.GRPC.MaxRecvMsgSize))
        }
        if config.GRPC.MaxSendMsgSize > 0 {
                serverOpts = append(serverOpts, grpc.MaxSendMsgSize(config.GRPC.MaxSendMsgSize))
        }
        ttrpcServer, err := newTTRPCServer()
        if err != nil {
                return nil, err
        }
        // The TCP server shares the base options and only adds TLS credentials.
        tcpServerOpts := serverOpts
        if config.GRPC.TCPTLSCert != "" {
                log.G(ctx).Info("setting up tls on tcp GRPC services...")

                tlsCert, err := tls.LoadX509KeyPair(config.GRPC.TCPTLSCert, config.GRPC.TCPTLSKey)
                if err != nil {
                        return nil, err
                }
                tlsConfig := &tls.Config{Certificates: []tls.Certificate{tlsCert}}

                if config.GRPC.TCPTLSCA != "" {
                        caCertPool := x509.NewCertPool()
                        caCert, err := os.ReadFile(config.GRPC.TCPTLSCA)
                        if err != nil {
                                return nil, fmt.Errorf("failed to load CA file: %w", err)
                        }
                        // An empty pool combined with RequireAndVerifyClientCert would
                        // reject every client, so fail early on an unusable CA file.
                        if !caCertPool.AppendCertsFromPEM(caCert) {
                                return nil, fmt.Errorf("failed to load CA file: no valid PEM certificates found in %q", config.GRPC.TCPTLSCA)
                        }
                        tlsConfig.ClientCAs = caCertPool
                        tlsConfig.ClientAuth = tls.RequireAndVerifyClientCert
                }

                tcpServerOpts = append(tcpServerOpts, grpc.Creds(credentials.NewTLS(tlsConfig)))
        }

        // grpcService allows GRPC services to be registered with the underlying server
        type grpcService interface {
                Register(*grpc.Server) error
        }

        // tcpService allows GRPC services to be registered with the underlying tcp server
        type tcpService interface {
                RegisterTCP(*grpc.Server) error
        }

        // ttrpcService allows TTRPC services to be registered with the underlying server
        type ttrpcService interface {
                RegisterTTRPC(*ttrpc.Server) error
        }

        var (
                grpcServer = grpc.NewServer(serverOpts...)
                tcpServer  = grpc.NewServer(tcpServerOpts...)

                grpcServices  []grpcService
                tcpServices   []tcpService
                ttrpcServices []ttrpcService

                s = &Server{
                        prometheusServerMetrics: prometheusServerMetrics,
                        grpcServer:              grpcServer,
                        tcpServer:               tcpServer,
                        ttrpcServer:             ttrpcServer,
                        config:                  config,
                }
                initialized = plugin.NewPluginSet()
                required    = make(map[string]struct{})
        )
        for _, r := range config.RequiredPlugins {
                required[r] = struct{}{}
        }

        if currentVersion < version.ConfigVersion {
                t1 := time.Now()
                // Run migration for each configuration version
                // Run each plugin migration for each version to ensure that migration logic is simple and
                // focused on upgrading from one version at a time.
                for v := currentVersion; v < version.ConfigVersion; v++ {
                        for _, p := range loaded {
                                if p.ConfigMigration != nil {
                                        if err := p.ConfigMigration(ctx, v, config.Plugins); err != nil {
                                                return nil, err
                                        }
                                }
                        }
                }
                migrationT = migrationT + time.Since(t1)
        }
        if migrationT > 0 {
                log.G(ctx).WithField("t", migrationT).Warnf("Configuration migrated from version %d, use `containerd config migrate` to avoid migration", currentVersion)
        }

        for _, p := range loaded {
                id := p.URI()
                log.G(ctx).WithFields(log.Fields{"id": id, "type": p.Type}).Info("loading plugin")
                // Set to 1 once the plugin asks for a readiness callback; from then
                // on a failed init is fatal for the whole server.
                var mustSucceed int32

                initContext := plugin.NewContext(
                        ctx,
                        initialized,
                        map[string]string{
                                plugins.PropertyRootDir:      filepath.Join(config.Root, id),
                                plugins.PropertyStateDir:     filepath.Join(config.State, id),
                                plugins.PropertyGRPCAddress:  config.GRPC.Address,
                                plugins.PropertyTTRPCAddress: config.TTRPC.Address,
                        },
                )
                initContext.RegisterReadiness = func() func() {
                        atomic.StoreInt32(&mustSucceed, 1)
                        return s.RegisterReadiness()
                }

                // load the plugin specific configuration if it is provided
                if p.Config != nil {
                        pc, err := config.Decode(ctx, id, p.Config)
                        if err != nil {
                                return nil, err
                        }
                        initContext.Config = pc
                }
                result := p.Init(initContext)
                if err := initialized.Add(result); err != nil {
                        return nil, fmt.Errorf("could not add plugin result to plugin set: %w", err)
                }

                instance, err := result.Instance()
                if err != nil {
                        if plugin.IsSkipPlugin(err) {
                                log.G(ctx).WithFields(log.Fields{"error": err, "id": id, "type": p.Type}).Info("skip loading plugin")
                        } else {
                                log.G(ctx).WithFields(log.Fields{"error": err, "id": id, "type": p.Type}).Warn("failed to load plugin")
                        }
                        if _, ok := required[id]; ok {
                                return nil, fmt.Errorf("load required plugin %s: %w", id, err)
                        }
                        // If readiness was registered during initialization, the plugin cannot fail
                        if atomic.LoadInt32(&mustSucceed) != 0 {
                                return nil, fmt.Errorf("plugin failed after registering readiness %s: %w", id, err)
                        }
                        continue
                }

                // The plugin loaded, so it no longer counts as a missing requirement.
                delete(required, id)
                // check for grpc services that should be registered with the server
                if src, ok := instance.(grpcService); ok {
                        grpcServices = append(grpcServices, src)
                }
                if src, ok := instance.(ttrpcService); ok {
                        ttrpcServices = append(ttrpcServices, src)
                }
                if service, ok := instance.(tcpService); ok {
                        tcpServices = append(tcpServices, service)
                }

                s.plugins = append(s.plugins, result)
        }
        // Anything still listed here was required but never loaded successfully.
        if len(required) != 0 {
                var missing []string
                for id := range required {
                        missing = append(missing, id)
                }
                return nil, fmt.Errorf("required plugin %s not included", missing)
        }

        // register services after all plugins have been initialized
        for _, service := range grpcServices {
                if err := service.Register(grpcServer); err != nil {
                        return nil, err
                }
        }
        for _, service := range ttrpcServices {
                if err := service.RegisterTTRPC(ttrpcServer); err != nil {
                        return nil, err
                }
        }
        for _, service := range tcpServices {
                if err := service.RegisterTCP(tcpServer); err != nil {
                        return nil, err
                }
        }

        recordConfigDeprecations(ctx, config, initialized)
        return s, nil
}

// recordConfigDeprecations attempts to record use of any deprecated config field.  Failures are logged and ignored.
//
// The warning service is looked up in set; when it is missing, failed to load,
// or is not a warning.Service, a warning is logged and nothing is recorded.
func recordConfigDeprecations(ctx context.Context, config *srvconfig.Config, set *plugin.Set) {
        // record any detected deprecations without blocking server startup
        p := set.Get(plugins.WarningPlugin, plugins.DeprecationsPlugin)
        if p == nil {
                log.G(ctx).Warn("failed to find warning service to record deprecations")
                return
        }
        instance, err := p.Instance()
        if err != nil {
                log.G(ctx).WithError(err).Warn("failed to load warning service to record deprecations")
                return
        }
        warn, ok := instance.(warning.Service)
        if !ok {
                // err is always nil on this path, so report the unexpected type
                // instead of attaching an empty error to the log entry.
                log.G(ctx).WithField("type", fmt.Sprintf("%T", instance)).Warn("failed to load warning service to record deprecations, unexpected plugin type")
                return
        }

        if config.PluginDir != "" { //nolint:staticcheck
                warn.Emit(ctx, deprecation.GoPluginLibrary)
        }
}

// Server is the containerd main daemon
type Server struct {
        // prometheusServerMetrics is used to initialize metrics for the gRPC
        // and TCP servers right before they start serving.
        prometheusServerMetrics *grpc_prometheus.ServerMetrics
        // grpcServer is served by ServeGRPC and stopped by Stop.
        grpcServer              *grpc.Server
        // ttrpcServer is served by ServeTTRPC.
        ttrpcServer             *ttrpc.Server
        // tcpServer is served by ServeTCP.
        tcpServer               *grpc.Server
        // config is the configuration the server was created with.
        config                  *srvconfig.Config
        // plugins holds the successfully loaded plugins in load order; Stop
        // closes them in reverse.
        plugins                 []*plugin.Plugin
        // ready tracks readiness registrations made through RegisterReadiness;
        // Wait blocks on it.
        ready                   sync.WaitGroup
}

// ServeGRPC serves the containerd grpc APIs on the provided listener.
// Metrics for the grpc server are initialized before serving starts, and the
// serve error is passed through trapClosedConnErr before being returned.
func (s *Server) ServeGRPC(l net.Listener) error {
        s.prometheusServerMetrics.InitializeMetrics(s.grpcServer)
        serveErr := s.grpcServer.Serve(l)
        return trapClosedConnErr(serveErr)
}

// ServeTTRPC serves the containerd ttrpc APIs on the provided listener using
// a background context. The serve error is passed through trapClosedConnErr
// before being returned.
func (s *Server) ServeTTRPC(l net.Listener) error {
        serveErr := s.ttrpcServer.Serve(context.Background(), l)
        return trapClosedConnErr(serveErr)
}

// ServeMetrics serves the metrics handler at /v1/metrics on the provided
// listener, using a dedicated mux. The serve error is passed through
// trapClosedConnErr before being returned.
func (s *Server) ServeMetrics(l net.Listener) error {
        mux := http.NewServeMux()
        mux.Handle("/v1/metrics", metrics.Handler())

        httpServer := &http.Server{
                // "G112: Potential Slowloris Attack (gosec)"; not a real concern for our use, so setting a long timeout.
                ReadHeaderTimeout: 5 * time.Minute,
                Handler:           mux,
        }
        serveErr := httpServer.Serve(l)
        return trapClosedConnErr(serveErr)
}

// ServeTCP serves the services registered on the tcp server on the provided
// listener. Metrics for the tcp server are initialized before serving starts,
// and the serve error is passed through trapClosedConnErr before being
// returned.
func (s *Server) ServeTCP(l net.Listener) error {
        s.prometheusServerMetrics.InitializeMetrics(s.tcpServer)
        serveErr := s.tcpServer.Serve(l)
        return trapClosedConnErr(serveErr)
}

// ServeDebug serves the expvar and pprof debug endpoints on the provided
// listener. The serve error is passed through trapClosedConnErr before being
// returned.
func (s *Server) ServeDebug(l net.Listener) error {
        // A private mux (rather than the default one) guarantees that only the
        // handlers listed here are exposed via containerd.
        mux := http.NewServeMux()
        mux.Handle("/debug/vars", expvar.Handler())
        mux.HandleFunc("/debug/pprof/", pprof.Index)
        mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
        mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
        mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
        mux.HandleFunc("/debug/pprof/trace", pprof.Trace)

        httpServer := &http.Server{
                // "G112: Potential Slowloris Attack (gosec)"; not a real concern for our use, so setting a long timeout.
                ReadHeaderTimeout: 5 * time.Minute,
                Handler:           mux,
        }
        serveErr := httpServer.Serve(l)
        return trapClosedConnErr(serveErr)
}

// Stop stops the grpc server and then closes every loaded plugin whose
// instance implements io.Closer, walking the plugins in reverse load order.
// Failures to obtain or close an instance are logged and do not interrupt the
// remaining shutdown work.
func (s *Server) Stop() {
        s.grpcServer.Stop()
        for remaining := len(s.plugins); remaining > 0; remaining-- {
                loaded := s.plugins[remaining-1]
                inst, err := loaded.Instance()
                if err != nil {
                        log.L.WithFields(log.Fields{"error": err, "id": loaded.Registration.URI()}).Error("could not get plugin instance")
                        continue
                }
                if c, isCloser := inst.(io.Closer); isCloser {
                        if closeErr := c.Close(); closeErr != nil {
                                log.L.WithFields(log.Fields{"error": closeErr, "id": loaded.Registration.URI()}).Error("failed to close plugin")
                        }
                }
        }
}

// RegisterReadiness adds one pending entry to the server's readiness wait
// group and returns the function that marks that entry as done.
func (s *Server) RegisterReadiness() func() {
        s.ready.Add(1)
        return s.ready.Done
}

// Wait blocks until every readiness entry added through RegisterReadiness has
// been marked as done.
func (s *Server) Wait() {
        s.ready.Wait()
}

// LoadPlugins loads all plugins into containerd and generates an ordered graph
// of all plugins.
//
// Dynamic plugins are loaded from config.PluginDir, or from "plugins" under
// config.Root when no directory is configured. The local content store and one
// proxy plugin per supported entry of config.ProxyPlugins are then registered.
// Proxy entries with an unknown type are logged and skipped. The returned
// graph is filtered by config.DisabledPlugins.
func LoadPlugins(ctx context.Context, config *srvconfig.Config) ([]plugin.Registration, error) {
        // load all plugins into containerd
        path := config.PluginDir //nolint:staticcheck
        if path == "" {
                path = filepath.Join(config.Root, "plugins")
        }
        if count, err := dynamic.Load(path); err != nil {
                return nil, err
        } else if count > 0 || config.PluginDir != "" { //nolint:staticcheck
                // Record the directory that was actually used so that PluginDir
                // is non-empty whenever dynamic plugins were in play.
                config.PluginDir = path //nolint:staticcheck
                log.G(ctx).Warningf("loaded %d dynamic plugins. `go_plugin` is deprecated, please use `external plugins` instead", count)
        }
        // load additional plugins that don't automatically register themselves
        registry.Register(&plugin.Registration{
                Type: plugins.ContentPlugin,
                ID:   "content",
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        root := ic.Properties[plugins.PropertyRootDir]
                        ic.Meta.Exports["root"] = root
                        return local.NewStore(root)
                },
        })

        // Connections are shared between proxy plugins that use the same address.
        clients := &proxyClients{}
        for name, pp := range config.ProxyPlugins {
                var (
                        t plugin.Type
                        f func(*grpc.ClientConn) interface{}

                        address = pp.Address
                        p       v1.Platform
                        err     error
                )

                switch pp.Type {
                case string(plugins.SnapshotPlugin), "snapshot":
                        t = plugins.SnapshotPlugin
                        ssname := name
                        f = func(conn *grpc.ClientConn) interface{} {
                                return ssproxy.NewSnapshotter(ssapi.NewSnapshotsClient(conn), ssname)
                        }

                case string(plugins.ContentPlugin), "content":
                        t = plugins.ContentPlugin
                        f = func(conn *grpc.ClientConn) interface{} {
                                return csproxy.NewContentStore(conn)
                        }
                case string(plugins.SandboxControllerPlugin), "sandbox":
                        t = plugins.SandboxControllerPlugin
                        f = func(conn *grpc.ClientConn) interface{} {
                                return sbproxy.NewSandboxController(sbapi.NewControllerClient(conn))
                        }
                case string(plugins.DiffPlugin), "diff":
                        t = plugins.DiffPlugin
                        f = func(conn *grpc.ClientConn) interface{} {
                                return diffproxy.NewDiffApplier(diffapi.NewDiffClient(conn))
                        }
                default:
                        log.G(ctx).WithField("type", pp.Type).Warn("unknown proxy plugin type")
                        // Without a known type there is neither a plugin type nor a
                        // constructor, so the entry cannot be registered.
                        continue
                }
                if pp.Platform != "" {
                        p, err = platforms.Parse(pp.Platform)
                        if err != nil {
                                // NOTE(review): the plugin is still registered below with
                                // whatever platform value Parse returned; confirm that
                                // this is the intended meaning of "skipping".
                                log.G(ctx).WithFields(log.Fields{"error": err, "plugin": name}).Warn("skipping proxy platform with bad platform")
                        }
                } else {
                        p = platforms.DefaultSpec()
                }

                exports := pp.Exports
                if exports == nil {
                        exports = map[string]string{}
                }
                exports["address"] = address

                registry.Register(&plugin.Registration{
                        Type: t,
                        ID:   name,
                        InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                                ic.Meta.Exports = exports
                                ic.Meta.Platforms = append(ic.Meta.Platforms, p)
                                conn, err := clients.getClient(address)
                                if err != nil {
                                        return nil, err
                                }
                                return f(conn), nil
                        },
                })

        }

        filter := srvconfig.V2DisabledFilter
        // return the ordered graph for plugins
        return registry.Graph(filter(config.DisabledPlugins)), nil
}

// proxyClients caches grpc client connections keyed by the address they were
// dialed with.
type proxyClients struct {
        // m guards clients.
        m       sync.Mutex
        // clients is created lazily by getClient on first use.
        clients map[string]*grpc.ClientConn
}

// getClient returns the grpc connection for address, dialing it on first use
// and caching it so that later calls with the same address share the
// connection. A dial failure is returned wrapped and nothing is cached.
func (pc *proxyClients) getClient(address string) (*grpc.ClientConn, error) {
        pc.m.Lock()
        defer pc.m.Unlock()

        // Looking up a key in a nil map is safe and simply reports a miss.
        if cached, found := pc.clients[address]; found {
                return cached, nil
        }
        if pc.clients == nil {
                pc.clients = make(map[string]*grpc.ClientConn)
        }

        retry := backoff.DefaultConfig
        retry.MaxDelay = 3 * time.Second

        dialOpts := []grpc.DialOption{
                grpc.WithTransportCredentials(insecure.NewCredentials()),
                grpc.WithConnectParams(grpc.ConnectParams{Backoff: retry}),
                grpc.WithContextDialer(dialer.ContextDialer),

                // TODO(stevvooe): We may need to allow configuration of this on the client.
                grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(defaults.DefaultMaxRecvMsgSize)),
                grpc.WithDefaultCallOptions(grpc.MaxCallSendMsgSize(defaults.DefaultMaxSendMsgSize)),
        }

        cc, dialErr := grpc.Dial(dialer.DialAddress(address), dialOpts...)
        if dialErr != nil {
                return nil, fmt.Errorf("failed to dial %q: %w", address, dialErr)
        }

        pc.clients[address] = cc
        return cc, nil
}

// trapClosedConnErr turns an error that is, or wraps, net.ErrClosed into nil
// and returns every other value (including nil) unchanged.
func trapClosedConnErr(err error) error {
        if errors.Is(err, net.ErrClosed) {
                return nil
        }
        return err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "os"

        "github.com/containerd/cgroups/v3"
        cgroup1 "github.com/containerd/cgroups/v3/cgroup1"
        cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
        srvconfig "github.com/containerd/containerd/v2/cmd/containerd/server/config"
        "github.com/containerd/containerd/v2/pkg/sys"
        "github.com/containerd/log"
        "github.com/containerd/ttrpc"
        specs "github.com/opencontainers/runtime-spec/specs-go"
)

// apply sets config settings on the server process.
//
// A non-zero OOM score is applied to the current process; a failure there is
// only logged. When a cgroup path is configured the current process is added
// to that cgroup, using the cgroup v2 API in unified mode and the cgroup v1
// API otherwise. For v1, a cgroup reported as deleted is created first.
func apply(ctx context.Context, config *srvconfig.Config) error {
        if score := config.OOMScore; score != 0 {
                log.G(ctx).Debugf("changing OOM score to %d", score)
                if err := sys.SetOOMScore(os.Getpid(), score); err != nil {
                        log.G(ctx).WithError(err).Errorf("failed to change OOM score to %d", score)
                }
        }

        cgroupPath := config.Cgroup.Path
        if cgroupPath == "" {
                return nil
        }

        if cgroups.Mode() == cgroups.Unified {
                manager, err := cgroupsv2.Load(cgroupPath)
                if err != nil {
                        return err
                }
                return manager.AddProc(uint64(os.Getpid()))
        }

        control, err := cgroup1.Load(cgroup1.StaticPath(cgroupPath))
        if err == cgroup1.ErrCgroupDeleted {
                control, err = cgroup1.New(cgroup1.StaticPath(cgroupPath), &specs.LinuxResources{})
        }
        if err != nil {
                return err
        }
        return control.AddProc(uint64(os.Getpid()))
}

// newTTRPCServer creates a ttrpc server configured with the
// UnixSocketRequireSameUser handshaker.
func newTTRPCServer() (*ttrpc.Server, error) {
        handshaker := ttrpc.UnixSocketRequireSameUser()
        return ttrpc.NewServer(ttrpc.WithServerHandshaker(handshaker))
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package apparmor

import (
        "bytes"
        "context"
        "fmt"
        "os"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/pkg/oci"
        specs "github.com/opencontainers/runtime-spec/specs-go"
)

// WithProfile sets the provided apparmor profile to the spec.
//
// The spec's Process section is created when it is missing, so the option does
// not dereference a nil pointer on a spec whose process has not been set yet.
func WithProfile(profile string) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error {
                if s.Process == nil {
                        s.Process = &specs.Process{}
                }
                s.Process.ApparmorProfile = profile
                return nil
        }
}

// WithDefaultProfile will generate a default apparmor profile under the provided name
// for the container.  It is only generated if a profile under that name does not exist.
//
// The spec's Process section is created when it is missing, so the option does
// not dereference a nil pointer on a spec whose process has not been set yet.
// An error from loading the profile is returned and leaves the spec untouched.
//
// FIXME: pkg/cri/[sb]server/container_create_linux_test.go depends on go:noinline
// since Go 1.21.
//
//go:noinline
func WithDefaultProfile(name string) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error {
                if err := LoadDefaultProfile(name); err != nil {
                        return err
                }
                if s.Process == nil {
                        s.Process = &specs.Process{}
                }
                s.Process.ApparmorProfile = name
                return nil
        }
}

// LoadDefaultProfile makes sure the default profile is loaded under the given
// name. It returns nil right away when isLoaded reports the profile as loaded.
//
// Otherwise the profile is rendered into a temporary file created under
// $XDG_RUNTIME_DIR and handed to load; the temporary file is removed before
// the function returns.
func LoadDefaultProfile(name string) error {
        if alreadyLoaded, err := isLoaded(name); err != nil {
                return err
        } else if alreadyLoaded {
                return nil
        }

        profile, err := loadData(name)
        if err != nil {
                return err
        }

        tmpFile, err := os.CreateTemp(os.Getenv("XDG_RUNTIME_DIR"), profile.Name)
        if err != nil {
                return err
        }
        defer tmpFile.Close()
        tmpPath := tmpFile.Name()
        defer os.Remove(tmpPath)

        if genErr := generate(profile, tmpFile); genErr != nil {
                return genErr
        }
        if loadErr := load(tmpPath); loadErr != nil {
                return fmt.Errorf("load apparmor profile %s: %w", tmpPath, loadErr)
        }
        return nil
}

// DumpDefaultProfile renders the default profile for the given name and
// returns it as a string. Nothing is written to disk or loaded.
func DumpDefaultProfile(name string) (string, error) {
        profile, err := loadData(name)
        if err != nil {
                return "", err
        }

        rendered := new(bytes.Buffer)
        if genErr := generate(profile, rendered); genErr != nil {
                return "", genErr
        }
        return rendered.String(), nil
}

//go:build gofuzz

// Copyright 2022 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package apparmor

import (
        "os"
)

// FuzzLoadDefaultProfile writes data to a file named "fuzz_file" in the
// working directory and then calls LoadDefaultProfile with that name, ignoring
// its error. It returns 0 when the file could not be created or written and 1
// otherwise. The file is removed on return.
func FuzzLoadDefaultProfile(data []byte) int {
        const fuzzName = "fuzz_file"

        out, err := os.Create(fuzzName)
        if err != nil {
                return 0
        }
        defer out.Close()
        defer os.Remove(fuzzName)

        if _, writeErr := out.Write(data); writeErr != nil {
                return 0
        }
        _ = LoadDefaultProfile(fuzzName)
        return 1
}

//go:build linux

/*
   Copyright The docker Authors.
   Copyright The Moby Authors.
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package apparmor

import (
        "bufio"
        "fmt"
        "io"
        "os"
        "os/exec"
        "path"
        "strings"
        "text/template"

        "github.com/containerd/log"
)

// NOTE: This code is copied from <github.com/docker/docker/profiles/apparmor>.
//       If you plan to make any changes, please make sure they are also sent
//       upstream.

// dir is the directory that macroExists joins macro names onto when checking
// whether an apparmor macro file is present.
const dir = "/etc/apparmor.d"

// defaultTemplate is the text/template source of the default apparmor profile.
// generate parses it and executes it with a *data value, whose fields supply
// the profile name, the import lines, the daemon profile and the optional
// RootlessKit peer.
const defaultTemplate = `
{{range $value := .Imports}}
{{$value}}
{{end}}

profile {{.Name}} flags=(attach_disconnected,mediate_deleted) {
{{range $value := .InnerImports}}
  {{$value}}
{{end}}

  network,
  capability,
  file,
  umount,
  # Host (privileged) processes may send signals to container processes.
  signal (receive) peer=unconfined,
  # runc may send signals to container processes.
  signal (receive) peer=runc,
  # crun may send signals to container processes.
  signal (receive) peer=crun,
  # Manager may send signals to container processes.
  signal (receive) peer={{.DaemonProfile}},
  # Container processes may send signals amongst themselves.
  signal (send,receive) peer={{.Name}},
{{if .RootlessKit}}
  # https://github.com/containerd/nerdctl/issues/2730
  signal (receive) peer={{.RootlessKit}},
{{end}}

  deny @{PROC}/* w,   # deny write for all files directly in /proc (not in a subdir)
  # deny write to files not in /proc/<number>/** or /proc/sys/**
  deny @{PROC}/{[^1-9],[^1-9][^0-9],[^1-9s][^0-9y][^0-9s],[^1-9][^0-9][^0-9][^0-9]*}/** w,
  deny @{PROC}/sys/[^k]** w,  # deny /proc/sys except /proc/sys/k* (effectively /proc/sys/kernel)
  deny @{PROC}/sys/kernel/{?,??,[^s][^h][^m]**} w,  # deny everything except shm* in /proc/sys/kernel/
  deny @{PROC}/sysrq-trigger rwklx,
  deny @{PROC}/mem rwklx,
  deny @{PROC}/kmem rwklx,
  deny @{PROC}/kcore rwklx,

  deny mount,

  deny /sys/[^f]*/** wklx,
  deny /sys/f[^s]*/** wklx,
  deny /sys/fs/[^c]*/** wklx,
  deny /sys/fs/c[^g]*/** wklx,
  deny /sys/fs/cg[^r]*/** wklx,
  deny /sys/firmware/** rwklx,
  deny /sys/devices/virtual/powercap/** rwklx,
  deny /sys/kernel/security/** rwklx,

  # allow processes within the container to trace each other,
  # provided all other LSM and yama setting allow it.
  ptrace (trace,tracedby,read,readby) peer={{.Name}},
}
`

// data carries the values substituted into defaultTemplate when a profile is
// generated.
type data struct {
        // Name is the profile name; it is also used as the peer for signals and
        // ptrace between processes of the same profile.
        Name          string
        // Imports are the lines emitted before the profile block.
        Imports       []string
        // InnerImports are the lines emitted at the start of the profile block.
        InnerImports  []string
        // DaemonProfile is the peer profile allowed to send signals to the
        // container; loadData derives it from /proc/self/attr/current.
        DaemonProfile string
        // RootlessKit is the path found for the rootlesskit binary, or empty
        // when the lookup failed, in which case its signal rule is omitted.
        RootlessKit   string
}

// cleanProfileName returns the part of profile that precedes the first space,
// or "unconfined" when that part is empty.
func cleanProfileName(profile string) string {
        // Normally profiles are suffixed by " (enforce)". AppArmor profiles cannot
        // contain spaces so this doesn't restrict daemon profile names.
        if space := strings.IndexByte(profile, ' '); space >= 0 {
                profile = profile[:space]
        }
        if len(profile) == 0 {
                return "unconfined"
        }
        return profile
}

// loadData assembles the template input for the profile called name.
//
// It picks the global import (the tunables/global include when that macro
// exists, otherwise a literal @{PROC} definition), adds the abstractions/base
// include when available, records the daemon's own profile and looks up the
// rootlesskit binary. The returned error is always nil.
func loadData(name string) (*data, error) {
        globalImport := "@{PROC}=/proc/"
        if macroExists("tunables/global") {
                globalImport = "#include <tunables/global>"
        }
        p := &data{
                Name:    name,
                Imports: []string{globalImport},
        }
        if macroExists("abstractions/base") {
                p.InnerImports = []string{"#include <abstractions/base>"}
        }

        // Determine the daemon profile. A failed read leaves the string empty,
        // which cleanProfileName treats as running unconfined (generally the
        // default).
        var currentProfile string
        if raw, err := os.ReadFile("/proc/self/attr/current"); err == nil {
                currentProfile = string(raw)
        }
        p.DaemonProfile = cleanProfileName(currentProfile)

        // If we were running in Rootless mode, we could read `/proc/$(cat ${ROOTLESSKIT_STATE_DIR}/child_pid)/exe`,
        // but `nerdctl apparmor load` has to be executed as the root.
        // So, do not check ${ROOTLESSKIT_STATE_DIR} (nor EUID) here.
        rootlessKit, err := exec.LookPath("rootlesskit")
        if err != nil {
                log.L.WithError(err).Debug("apparmor: failed to determine the RootlessKit binary path")
                rootlessKit = ""
        }
        p.RootlessKit = rootlessKit
        log.L.Debugf("apparmor: RootlessKit=%q", p.RootlessKit)

        return p, nil
}

// generate parses defaultTemplate and renders it with p into o.
// It returns the parse error, if any, otherwise the result of executing
// the template.
func generate(p *data, o io.Writer) error {
        tmpl, parseErr := template.New("apparmor_profile").Parse(defaultTemplate)
        if parseErr == nil {
                return tmpl.Execute(o, p)
        }
        return parseErr
}

// load runs apparmor_parser with "-Kr" on the profile file at path.
// On failure the parser's trimmed output is included in the wrapped error.
func load(path string) error {
        if out, err := aaParser("-Kr", path); err != nil {
                return fmt.Errorf("parser error(%q): %w", strings.TrimSpace(out), err)
        }
        return nil
}

// macroExists reports whether the macro m can be stat'ed beneath dir.
// Any stat error, not only "does not exist", counts as absent.
func macroExists(m string) bool {
        if _, err := os.Stat(path.Join(dir, m)); err != nil {
                return false
        }
        return true
}

// aaParser invokes the apparmor_parser binary with args and returns its
// combined stdout/stderr as a string together with the command's error.
func aaParser(args ...string) (string, error) {
        cmd := exec.Command("apparmor_parser", args...)
        output, err := cmd.CombinedOutput()
        return string(output), err
}

// isLoaded reports whether a profile called name is listed in
// /sys/kernel/security/apparmor/profiles.
//
// A line matches when it starts with name followed by a space. It returns an
// error when the file cannot be opened or a read fails with anything other
// than io.EOF.
func isLoaded(name string) (bool, error) {
        f, err := os.Open("/sys/kernel/security/apparmor/profiles")
        if err != nil {
                return false, err
        }
        defer f.Close()

        prefix := name + " "
        r := bufio.NewReader(f)
        for {
                line, err := r.ReadString('\n')
                if err != nil && err != io.EOF {
                        return false, err
                }
                // ReadString may hand back data together with io.EOF when the
                // last line has no trailing newline, so check the line before
                // acting on EOF.
                if strings.HasPrefix(line, prefix) {
                        return true, nil
                }
                if err == io.EOF {
                        return false, nil
                }
        }
}

//go:build gofuzz

/*
   Copyright The containerd Authors.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package fuzz

import (
        "archive/tar"
        "bytes"
        "context"
        "io"
        "os"
        "path"

        fuzz "github.com/AdaLogics/go-fuzz-headers"

        imageArchive "github.com/containerd/containerd/v2/core/images/archive"
        "github.com/containerd/containerd/v2/pkg/archive"
        "github.com/containerd/containerd/v2/plugins/content/local"
        "github.com/containerd/log"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// FuzzApply is a fuzzer entry point that unpacks fuzz-generated tar
// archives into a temporary directory via archive.Apply.
// It returns 0 when the fuzz input cannot be consumed or the temporary
// directory cannot be created, and 1 otherwise.
func FuzzApply(data []byte) int {
        ctx := context.Background()

        // Apply() is logging the below message, which is too noisy and not really useful
        // if the input is random.
        //
        // level=warning msg="ignored xattr ... in archive" error="operation not supported"
        log.G(ctx).Logger.SetLevel(log.PanicLevel)

        consumer := fuzz.NewConsumer(data)
        iters, err := consumer.GetInt()
        if err != nil {
                return 0
        }
        const maxIters = 20
        tmpDir, err := os.MkdirTemp("", "prefix-test")
        if err != nil {
                return 0
        }
        defer os.RemoveAll(tmpDir)
        for remaining := iters % maxIters; remaining > 0; remaining-- {
                tarData, err := consumer.TarBytes()
                if err != nil {
                        return 0
                }
                // Errors from Apply are expected for random input and ignored.
                _, _ = archive.Apply(ctx, tmpDir, bytes.NewReader(tarData))
        }
        return 1
}

// FuzzImportIndex implements a fuzzer
// that targets archive.ImportIndex()
//
// The fuzz input supplies a tar archive and a flag. When the flag is set and
// the archive contains neither an OCI layout file nor a manifest.json, the
// archive is rewritten with a fuzz-generated manifest.json appended.
// The (possibly rewritten) archive is then imported into a throw-away local
// content store. It returns 0 when the input cannot be consumed or set-up
// fails, and 1 after ImportIndex has been called.
func FuzzImportIndex(data []byte) int {
        f := fuzz.NewConsumer(data)
        tarBytes, err := f.TarBytes()
        if err != nil {
                return 0
        }
        ctx := context.Background()
        r := bytes.NewReader(tarBytes)
        shouldRequireLayoutOrManifest, err := f.GetBool()
        if err != nil {
                return 0
        }
        if shouldRequireLayoutOrManifest {
                hasLayoutOrManifest := false
                // Scan with a dedicated reader so r stays positioned at the
                // start of the archive for ImportIndex.
                tr := tar.NewReader(bytes.NewReader(tarBytes))
                for {
                        hdr, err := tr.Next()
                        if err == io.EOF {
                                break
                        }
                        if err != nil {
                                return 0
                        }
                        hdrName := path.Clean(hdr.Name)
                        switch hdrName {
                        case ocispec.ImageLayoutFile, "manifest.json":
                                hasLayoutOrManifest = true
                        }
                }
                if !hasLayoutOrManifest {
                        var buf bytes.Buffer
                        tw := tar.NewWriter(&buf)
                        // Copy the existing entries from the beginning of the
                        // archive; the scan above used its own reader.
                        tr := tar.NewReader(bytes.NewReader(tarBytes))
                        for {
                                hdr, err := tr.Next()
                                if err == io.EOF {
                                        break
                                }
                                if err != nil {
                                        return 0
                                }
                                fileContents, err := io.ReadAll(tr)
                                if err != nil {
                                        return 0
                                }
                                if err := tw.WriteHeader(hdr); err != nil {
                                        return 0
                                }
                                if _, err := tw.Write(fileContents); err != nil {
                                        return 0
                                }
                        }
                        manifestFileContents, err := f.GetBytes()
                        if err != nil {
                                return 0
                        }
                        if err := tw.WriteHeader(&tar.Header{
                                Name:     "manifest.json",
                                Mode:     0644,
                                Size:     int64(len(manifestFileContents)),
                                Typeflag: tar.TypeReg,
                        }); err != nil {
                                return 0
                        }
                        if _, err := tw.Write(manifestFileContents); err != nil {
                                return 0
                        }
                        // Close before reading the buffer so the final padding
                        // and the archive footer are part of the bytes handed
                        // to ImportIndex.
                        if err := tw.Close(); err != nil {
                                return 0
                        }
                        r = bytes.NewReader(buf.Bytes())
                }
        }
        tmpdir, err := os.MkdirTemp("", "fuzzing-")
        if err != nil {
                return 0
        }
        // Remove the store directory so repeated fuzz runs do not pile up
        // temporary directories.
        defer os.RemoveAll(tmpdir)
        cs, err := local.NewStore(tmpdir)
        if err != nil {
                return 0
        }
        // Import errors are expected for random input and ignored.
        _, _ = imageArchive.ImportIndex(ctx, cs, r)
        return 1
}

//go:build gofuzz

/*
   Copyright The containerd Authors.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package fuzz

import (
        "bytes"
        "context"

        fuzz "github.com/AdaLogics/go-fuzz-headers"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/pkg/namespaces"
)

// fuzzContext returns a cancellable context that carries the
// "fuzzing-namespace" namespace, together with its cancel function.
func fuzzContext() (context.Context, context.CancelFunc) {
        base, cancel := context.WithCancel(context.Background())
        return namespaces.WithNamespace(base, "fuzzing-namespace"), cancel
}

// FuzzContainerdImport is a fuzzer entry point that feeds fuzz-generated
// byte slices to client.Import through a client connected to defaultAddress.
// The daemon is started once via initDaemon. It returns 0 when the client
// cannot be created or the fuzz input runs out, and 1 otherwise.
func FuzzContainerdImport(data []byte) int {
        initDaemon.Do(startDaemon)

        client, err := containerd.New(defaultAddress)
        if err != nil {
                return 0
        }
        defer client.Close()

        consumer := fuzz.NewConsumer(data)

        noOfImports, err := consumer.GetInt()
        if err != nil {
                return 0
        }
        const maxImports = 20
        ctx, cancel := fuzzContext()
        defer cancel()
        for remaining := noOfImports % maxImports; remaining > 0; remaining-- {
                payload, err := consumer.GetBytes()
                if err != nil {
                        return 0
                }
                // Import errors are expected for random input and ignored.
                _, _ = client.Import(ctx, bytes.NewReader(payload))
        }
        return 1
}

//go:build gofuzz

/*
   Copyright The containerd Authors.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

//nolint:golint
package fuzz

import (
        "bytes"
        "context"
        _ "crypto/sha256" // required by go-digest
        "fmt"
        "os"
        "path/filepath"
        "reflect"

        fuzz "github.com/AdaLogics/go-fuzz-headers"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images/archive"
        "github.com/containerd/containerd/v2/plugins/content/local"
)

// checkBlobPath validates dgst and then stats the file
// <root>/blobs/<algorithm>/<encoded digest>, returning the first error hit.
func checkBlobPath(dgst digest.Digest, root string) error {
        if err := dgst.Validate(); err != nil {
                return err
        }
        blobPath := filepath.Join(root, "blobs", dgst.Algorithm().String(), dgst.Encoded())
        _, statErr := os.Stat(blobPath)
        return statErr
}

// generateBlobs reads a count from the fuzzer (taken modulo 4096) and then
// that many byte slices, returning them keyed by their digest. When the fuzz
// input runs out, the blobs collected so far are returned with the error.
func generateBlobs(f *fuzz.ConsumeFuzzer) (map[digest.Digest][]byte, error) {
        blobs := make(map[digest.Digest][]byte)
        blobQty, err := f.GetInt()
        if err != nil {
                return blobs, err
        }
        const maxBlobs = 4096

        for remaining := blobQty % maxBlobs; remaining > 0; remaining-- {
                payload, err := f.GetBytes()
                if err != nil {
                        return blobs, err
                }
                blobs[digest.FromBytes(payload)] = payload
        }

        return blobs, nil
}

// checkWrite stores p in cs through content.WriteBlob, using the digest
// string as the write reference, and returns dgst together with any error.
func checkWrite(ctx context.Context, cs content.Store, dgst digest.Digest, p []byte) (digest.Digest, error) {
        desc := ocispec.Descriptor{Size: int64(len(p)), Digest: dgst}
        err := content.WriteBlob(ctx, cs, dgst.String(), bytes.NewReader(p), desc)
        return dgst, err
}

// populateBlobStore generates blobs from the fuzz input and writes each of
// them into cs. It returns nil blobs when generation fails, and the generated
// blobs together with the error when a write fails.
func populateBlobStore(ctx context.Context, cs content.Store, f *fuzz.ConsumeFuzzer) (map[digest.Digest][]byte, error) {
        blobs, err := generateBlobs(f)
        if err != nil {
                return nil, err
        }

        for dgst, payload := range blobs {
                if _, err := checkWrite(ctx, cs, dgst, payload); err != nil {
                        return blobs, err
                }
        }
        return blobs, nil
}

// FuzzCSWalk implements a fuzzer that targets contentStore.Walk()
func FuzzCSWalk(data []byte) int {
        ctx := context.Background()
        expected := map[digest.Digest]struct{}{}
        found := map[digest.Digest]struct{}{}
        tmpdir, err := os.MkdirTemp("", "fuzzing-")
        if err != nil {
                return 0
        }
        defer os.RemoveAll(tmpdir)
        cs, err := local.NewStore(tmpdir)
        if err != nil {
                return 0
        }

        f := fuzz.NewConsumer(data)
        blobs, err := populateBlobStore(ctx, cs, f)
        if err != nil {
                return 0
        }

        for dgst := range blobs {
                expected[dgst] = struct{}{}
        }

        if err := cs.Walk(ctx, func(bi content.Info) error {
                found[bi.Digest] = struct{}{}
                err = checkBlobPath(bi.Digest, tmpdir)
                if err != nil {
                        return err
                }
                return nil
        }); err != nil {
                return 0
        }
        if !reflect.DeepEqual(expected, found) {
                panic(fmt.Sprintf("%v != %v but should be equal", found, expected))
        }
        return 1
}

// FuzzArchiveExport is a fuzzer entry point for archive.Export: it builds a
// fuzz-generated manifest descriptor, fills a temporary local store with
// fuzz-generated blobs and exports to the file "fuzz-output-file" in the
// working directory, which is removed on return.
// It returns 0 on any set-up error and 1 otherwise.
func FuzzArchiveExport(data []byte) int {
        const outputFile = "fuzz-output-file"

        f := fuzz.NewConsumer(data)
        var manifest ocispec.Descriptor
        if err := f.GenerateStruct(&manifest); err != nil {
                return 0
        }
        ctx := context.Background()
        tmpdir, err := os.MkdirTemp("", "fuzzing-")
        if err != nil {
                return 0
        }
        defer os.RemoveAll(tmpdir)
        cs, err := local.NewStore(tmpdir)
        if err != nil {
                return 0
        }
        if _, err := populateBlobStore(ctx, cs, f); err != nil {
                return 0
        }
        w, err := os.Create(outputFile)
        if err != nil {
                return 0
        }
        defer w.Close()
        defer os.Remove(outputFile)
        // Export errors are expected for random input and ignored.
        _ = archive.Export(ctx, cs, w, archive.WithManifest(manifest, "name"))
        return 1
}

//go:build gofuzz

/*
   Copyright The containerd Authors.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package fuzz

import (
        "context"
        "fmt"
        golangruntime "runtime"
        "strings"

        fuzz "github.com/AdaLogics/go-fuzz-headers"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        "github.com/containerd/containerd/v2/internal/cri/server"
        "github.com/containerd/containerd/v2/internal/cri/server/images"
        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
)

// ops lists the APIs the fuzzer can call, keyed by the index that a
// fuzz-provided integer is reduced to.
var ops = map[int]string{
        0:  "createContainer",
        1:  "removeContainer",
        2:  "addSandboxes",
        3:  "listContainers",
        4:  "startContainer",
        5:  "containerStats",
        6:  "listContainerStats",
        7:  "containerStatus",
        8:  "stopContainer",
        9:  "updateContainerResources",
        10: "listImages",
        11: "removeImages",
        12: "imageStatus",
        13: "imageFsInfo",
        14: "listPodSandbox",
        15: "portForward",
        16: "removePodSandbox",
        17: "runPodSandbox",
        18: "podSandboxStatus",
        19: "stopPodSandbox",
        20: "status",
        21: "updateRuntimeConfig",
}

// executionOrder accumulates one description per logged API call.
var executionOrder []string

// printExecutions is meant to be deferred. When a panic is in flight it
// recovers it, prints every entry of executionOrder to stdout and then
// panics again with the panic value rendered as a string. Without a panic
// it does nothing.
func printExecutions() {
        r := recover()
        if r == nil {
                return
        }

        var err string
        switch v := r.(type) {
        case string:
                err = v
        case golangruntime.Error:
                err = v.Error()
        case error:
                err = v.Error()
        default:
                // Keep the recovered value in the message so it is not lost.
                err = fmt.Sprintf("unknown error type: %v", r)
        }
        fmt.Println("Executions:")
        for _, eo := range executionOrder {
                fmt.Println(eo)
        }
        panic(err)
}

// fuzzCRIService is the service surface the CRI fuzz helpers in this file
// operate on: server.CRIService combined with the CRI runtime and image
// service server interfaces.
type fuzzCRIService interface {
        server.CRIService
        runtime.RuntimeServiceServer
        runtime.ImageServiceServer
}

// fuzzCRI drives c with a fuzz-chosen sequence of CRI calls.
//
// It reads a call count from f (taken modulo 40) and, for each call, an
// integer that selects an entry of ops. The errors returned by the individual
// helpers are not inspected. If a call panics, the deferred printExecutions
// prints the logged executions before panicking again.
// It returns 0 when f runs out of data while choosing a call, and 1 otherwise.
func fuzzCRI(f *fuzz.ConsumeFuzzer, c fuzzCRIService) int {
        // Reset the log and register the panic reporter exactly once;
        // registering it twice would print the executions twice per panic.
        executionOrder = make([]string, 0)
        defer printExecutions()

        calls, err := f.GetInt()
        if err != nil {
                return 0
        }

        for i := 0; i < calls%40; i++ {
                op, err := f.GetInt()
                if err != nil {
                        return 0
                }
                // A negative remainder has no entry in ops, so the lookup
                // yields "" and the switch below does nothing for it.
                opType := op % len(ops)

                switch ops[opType] {
                case "createContainer":
                        createContainerFuzz(c, f)
                case "removeContainer":
                        removeContainerFuzz(c, f)
                case "addSandboxes":
                        addSandboxesFuzz(c, f)
                case "listContainers":
                        listContainersFuzz(c, f)
                case "startContainer":
                        startContainerFuzz(c, f)
                case "containerStats":
                        containerStatsFuzz(c, f)
                case "listContainerStats":
                        listContainerStatsFuzz(c, f)
                case "containerStatus":
                        containerStatusFuzz(c, f)
                case "stopContainer":
                        stopContainerFuzz(c, f)
                case "updateContainerResources":
                        updateContainerResourcesFuzz(c, f)
                case "listImages":
                        listImagesFuzz(c, f)
                case "removeImages":
                        removeImagesFuzz(c, f)
                case "imageStatus":
                        imageStatusFuzz(c, f)
                case "imageFsInfo":
                        imageFsInfoFuzz(c, f)
                case "listPodSandbox":
                        listPodSandboxFuzz(c, f)
                case "portForward":
                        portForwardFuzz(c, f)
                case "removePodSandbox":
                        removePodSandboxFuzz(c, f)
                case "runPodSandbox":
                        runPodSandboxFuzz(c, f)
                case "podSandboxStatus":
                        podSandboxStatusFuzz(c, f)
                case "stopPodSandbox":
                        stopPodSandboxFuzz(c, f)
                case "status":
                        statusFuzz(c, f)
                case "updateRuntimeConfig":
                        updateRuntimeConfigFuzz(c, f)
                }
        }
        return 1
}

// logExecution appends a "Calling <apiName> with <request>" entry to
// executionOrder.
func logExecution(apiName, request string) {
        var entry strings.Builder
        fmt.Fprintf(&entry, "Calling %s with \n %s \n\n", apiName, request)
        executionOrder = append(executionOrder, entry.String())
}

// createContainerFuzz fills a CreateContainerRequest from the fuzz input,
// passes it to c.CreateContainer (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func createContainerFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.CreateContainerRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.CreateContainer(context.Background(), req)
        logExecution("c.CreateContainer", fmt.Sprintf("%+v", req))
        return nil
}

// removeContainerFuzz fills a RemoveContainerRequest from the fuzz input,
// passes it to c.RemoveContainer (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func removeContainerFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.RemoveContainerRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.RemoveContainer(context.Background(), req)
        logExecution("c.RemoveContainer", fmt.Sprintf("%+v", req))
        return nil
}

// sandboxStore returns the sandbox store of cs via server.SandboxStore.
// When the first lookup fails, the same call is made once more and its
// outcome is returned.
func sandboxStore(cs fuzzCRIService) (*sandboxstore.Store, error) {
        ss, err := server.SandboxStore(cs)
        if err == nil {
                return ss, nil
        }

        // Second attempt with the same argument.
        ss, err = server.SandboxStore(cs)
        if err != nil {
                return nil, err
        }
        return ss, nil
}

// addSandboxesFuzz adds a fuzz-chosen number of sandboxes (count modulo 20)
// to the sandbox store of c, stopping at the first error.
func addSandboxesFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        quantity, err := f.GetInt()
        if err != nil {
                return err
        }

        store, err := sandboxStore(c)
        if err != nil {
                return err
        }

        for remaining := quantity % 20; remaining > 0; remaining-- {
                sb, err := getSandboxFuzz(f)
                if err != nil {
                        return err
                }
                if err := store.Add(sb); err != nil {
                        return err
                }
        }
        return nil
}

// getSandboxFuzz builds a sandbox from fuzz-generated metadata and status,
// logging both before the sandbox is constructed.
func getSandboxFuzz(f *fuzz.ConsumeFuzzer) (sandboxstore.Sandbox, error) {
        var (
                metadata sandboxstore.Metadata
                status   sandboxstore.Status
        )
        if err := f.GenerateStruct(&metadata); err != nil {
                return sandboxstore.Sandbox{}, err
        }
        if err := f.GenerateStruct(&status); err != nil {
                return sandboxstore.Sandbox{}, err
        }

        logExecution("sandboxstore.NewSandbox", fmt.Sprintf("metadata: %+v\nstatus: %+v\n", metadata, status))

        return sandboxstore.NewSandbox(metadata, status), nil
}

// listContainersFuzz fills a ListContainersRequest from the fuzz input,
// passes it to c.ListContainers (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func listContainersFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.ListContainersRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.ListContainers(context.Background(), req)
        logExecution("c.ListContainers", fmt.Sprintf("%+v", req))
        return nil
}

// startContainerFuzz fills a StartContainerRequest from the fuzz input,
// passes it to c.StartContainer (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func startContainerFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.StartContainerRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.StartContainer(context.Background(), req)
        logExecution("c.StartContainer", fmt.Sprintf("%+v", req))
        return nil
}

// containerStatsFuzz fills a ContainerStatsRequest from the fuzz input,
// passes it to c.ContainerStats (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func containerStatsFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.ContainerStatsRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.ContainerStats(context.Background(), req)
        logExecution("c.ContainerStats", fmt.Sprintf("%+v", req))
        return nil
}

// listContainerStatsFuzz fills a ListContainerStatsRequest from the fuzz
// input, passes it to c.ListContainerStats (ignoring the result) and logs
// the request. It returns an error only when the request cannot be generated.
func listContainerStatsFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.ListContainerStatsRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.ListContainerStats(context.Background(), req)
        logExecution("c.ListContainerStats", fmt.Sprintf("%+v", req))
        return nil
}

// containerStatusFuzz fills a ContainerStatusRequest from the fuzz input,
// passes it to c.ContainerStatus (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func containerStatusFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.ContainerStatusRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.ContainerStatus(context.Background(), req)
        logExecution("c.ContainerStatus", fmt.Sprintf("%+v", req))
        return nil
}

// stopContainerFuzz fills a StopContainerRequest from the fuzz input,
// passes it to c.StopContainer (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func stopContainerFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.StopContainerRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.StopContainer(context.Background(), req)
        logExecution("c.StopContainer", fmt.Sprintf("%+v", req))
        return nil
}

// updateContainerResourcesFuzz fills an UpdateContainerResourcesRequest from
// the fuzz input, passes it to c.UpdateContainerResources (ignoring the
// result) and logs the request. It returns an error only when the request
// cannot be generated.
func updateContainerResourcesFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.UpdateContainerResourcesRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.UpdateContainerResources(context.Background(), req)
        logExecution("c.UpdateContainerResources", fmt.Sprintf("%+v", req))
        return nil
}

// listImagesFuzz fills a ListImagesRequest from the fuzz input, passes it
// to c.ListImages (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func listImagesFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.ListImagesRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.ListImages(context.Background(), req)
        logExecution("c.ListImages", fmt.Sprintf("%+v", req))
        return nil
}

// removeImagesFuzz fills a RemoveImageRequest from the fuzz input, passes
// it to c.RemoveImage (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func removeImagesFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.RemoveImageRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.RemoveImage(context.Background(), req)
        logExecution("c.RemoveImage", fmt.Sprintf("%+v", req))
        return nil
}

// imageStatusFuzz fills an ImageStatusRequest from the fuzz input, passes
// it to c.ImageStatus (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func imageStatusFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.ImageStatusRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.ImageStatus(context.Background(), req)
        logExecution("c.ImageStatus", fmt.Sprintf("%+v", req))
        return nil
}

// imageFsInfoFuzz fills an ImageFsInfoRequest from the fuzz input, passes
// it to c.ImageFsInfo (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func imageFsInfoFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.ImageFsInfoRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.ImageFsInfo(context.Background(), req)
        logExecution("c.ImageFsInfo", fmt.Sprintf("%+v", req))
        return nil
}

// listPodSandboxFuzz fills a ListPodSandboxRequest from the fuzz input,
// passes it to c.ListPodSandbox (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func listPodSandboxFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.ListPodSandboxRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.ListPodSandbox(context.Background(), req)
        logExecution("c.ListPodSandbox", fmt.Sprintf("%+v", req))
        return nil
}

// portForwardFuzz fills a PortForwardRequest from the fuzz input, passes
// it to c.PortForward (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func portForwardFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.PortForwardRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.PortForward(context.Background(), req)
        logExecution("c.PortForward", fmt.Sprintf("%+v", req))
        return nil
}

// removePodSandboxFuzz fills a RemovePodSandboxRequest from the fuzz input,
// passes it to c.RemovePodSandbox (ignoring the result) and logs the
// request. It returns an error only when the request cannot be generated.
func removePodSandboxFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.RemovePodSandboxRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.RemovePodSandbox(context.Background(), req)
        logExecution("c.RemovePodSandbox", fmt.Sprintf("%+v", req))
        return nil
}

// runPodSandboxFuzz fills a RunPodSandboxRequest from the fuzz input,
// passes it to c.RunPodSandbox (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func runPodSandboxFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.RunPodSandboxRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.RunPodSandbox(context.Background(), req)
        logExecution("c.RunPodSandbox", fmt.Sprintf("%+v", req))
        return nil
}

// podSandboxStatusFuzz fills a PodSandboxStatusRequest from the fuzz input,
// passes it to c.PodSandboxStatus (ignoring the result) and logs the
// request. It returns an error only when the request cannot be generated.
func podSandboxStatusFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.PodSandboxStatusRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.PodSandboxStatus(context.Background(), req)
        logExecution("c.PodSandboxStatus", fmt.Sprintf("%+v", req))
        return nil
}

// stopPodSandboxFuzz fills a StopPodSandboxRequest from the fuzz input,
// passes it to c.StopPodSandbox (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func stopPodSandboxFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.StopPodSandboxRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.StopPodSandbox(context.Background(), req)
        logExecution("c.StopPodSandbox", fmt.Sprintf("%+v", req))
        return nil
}

// statusFuzz fills a StatusRequest from the fuzz input, passes it to
// c.Status (ignoring the result) and logs the request.
// It returns an error only when the request cannot be generated.
func statusFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.StatusRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.Status(context.Background(), req)
        logExecution("c.Status", fmt.Sprintf("%+v", req))
        return nil
}

// updateRuntimeConfigFuzz fills an UpdateRuntimeConfigRequest from the
// fuzzer input, hands it to c.UpdateRuntimeConfig and records the call
// through logExecution. Only a failure to generate the request is reported;
// the outcome of the CRI call itself is discarded.
func updateRuntimeConfigFuzz(c fuzzCRIService, f *fuzz.ConsumeFuzzer) error {
        req := new(runtime.UpdateRuntimeConfigRequest)
        if err := f.GenerateStruct(req); err != nil {
                return err
        }
        _, _ = c.UpdateRuntimeConfig(context.Background(), req)
        logExecution("c.UpdateRuntimeConfig", fmt.Sprintf("%+v", req))
        return nil
}

// getContainer builds a containerstore.Container from fuzzer-generated
// metadata and status. The status is injected with
// containerstore.WithFakeStatus. An error is returned when either struct
// cannot be generated or when containerstore.NewContainer fails.
func getContainer(f *fuzz.ConsumeFuzzer) (containerstore.Container, error) {
        var (
                meta containerstore.Metadata
                st   containerstore.Status
        )

        if err := f.GenerateStruct(&meta); err != nil {
                return containerstore.Container{}, err
        }
        if err := f.GenerateStruct(&st); err != nil {
                return containerstore.Container{}, err
        }
        return containerstore.NewContainer(meta, containerstore.WithFakeStatus(st))
}

// FuzzParseAuth feeds a fuzzer-generated AuthConfig and host string to
// images.ParseAuth. It returns 1 once ParseAuth has been called and 0 when
// the arguments could not be generated from data.
func FuzzParseAuth(data []byte) int {
        consumer := fuzz.NewConsumer(data)

        authCfg := new(runtime.AuthConfig)
        if err := consumer.GenerateStruct(authCfg); err != nil {
                return 0
        }

        host, err := consumer.GetString()
        if err != nil {
                return 0
        }

        // Only panics are of interest here; the parsed result is discarded.
        _, _, _ = images.ParseAuth(authCfg, host)
        return 1
}

//go:build gofuzz

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package fuzz

import (
        fuzz "github.com/AdaLogics/go-fuzz-headers"
        "google.golang.org/grpc"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        "github.com/containerd/containerd/v2/internal/cri/instrument"
        "github.com/containerd/containerd/v2/internal/cri/server"
        "github.com/containerd/containerd/v2/internal/cri/server/images"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/errdefs"
)

// FuzzCRIServer runs fuzzCRI against a CRI service wired to the in-process
// containerd daemon. The daemon is started once via initDaemon. The function
// returns 0 when no client connection can be created, panics when the image
// or CRI service cannot be constructed, and otherwise returns whatever
// fuzzCRI returns.
func FuzzCRIServer(data []byte) int {
        initDaemon.Do(startDaemon)

        consumer := fuzz.NewConsumer(data)

        ctrd, err := containerd.New(defaultAddress)
        if err != nil {
                return 0
        }
        defer ctrd.Close()

        imgOpts := &images.CRIImageServiceOptions{Client: ctrd}
        imgSvc, err := images.NewService(criconfig.ImageConfig{}, imgOpts)
        if err != nil {
                panic(err)
        }

        criOpts := &server.CRIServiceOptions{
                RuntimeService: &fakeRuntimeService{},
                ImageService:   imgSvc,
                Client:         ctrd,
        }
        criSvc, rtServer, err := server.NewCRIService(criOpts)
        if err != nil {
                panic(err)
        }

        target := &service{
                CRIService:           criSvc,
                RuntimeServiceServer: rtServer,
                ImageServiceServer:   imgSvc.GRPCService(),
        }
        return fuzzCRI(consumer, target)
}

// fakeRuntimeService is the stub handed to server.NewCRIService as the
// RuntimeService option by FuzzCRIServer.
type fakeRuntimeService struct{}

// Config returns a zero-valued CRI configuration.
func (fakeRuntimeService) Config() criconfig.Config {
        var cfg criconfig.Config
        return cfg
}

// LoadOCISpec never yields a spec; it always reports errdefs.ErrNotFound.
func (fakeRuntimeService) LoadOCISpec(_ string) (*oci.Spec, error) {
        return nil, errdefs.ErrNotFound
}

// service bundles the CRI service with its runtime and image gRPC servers by
// embedding all three, so their methods are promoted onto a single value.
type service struct {
        server.CRIService
        runtime.RuntimeServiceServer
        runtime.ImageServiceServer
}

// Register wraps the service with instrument.NewService and registers the
// wrapper on s as both the runtime and the image gRPC server. It always
// returns nil.
func (c *service) Register(s *grpc.Server) error {
        wrapped := instrument.NewService(c)
        runtime.RegisterRuntimeServiceServer(s, wrapped)
        runtime.RegisterImageServiceServer(s, wrapped)
        return nil
}

//go:build gofuzz

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package fuzz

import (
        "context"
        "sync"
        "time"

        "github.com/containerd/containerd/v2/cmd/containerd/server"
        "github.com/containerd/containerd/v2/cmd/containerd/server/config"
        "github.com/containerd/containerd/v2/defaults"
        "github.com/containerd/containerd/v2/pkg/sys"
        "github.com/containerd/containerd/v2/version"
        "github.com/containerd/log"
)

// Locations used by the in-process containerd daemon started for fuzzing.
const (
        // defaultRoot is the daemon's Root directory.
        defaultRoot = "/var/lib/containerd"
        // defaultState is the daemon's State directory.
        defaultState = "/tmp/containerd"
        // defaultAddress is the gRPC socket the daemon listens on and that
        // fuzz clients connect to.
        defaultAddress = "/tmp/containerd/containerd.sock"
)

// initDaemon guards daemon start-up so that it happens only once.
var initDaemon sync.Once

// startDaemon boots an in-process containerd server that serves gRPC on
// defaultAddress. Start-up runs in a background goroutine; this function
// waits for that goroutine to either report an error or finish, or for a
// 10 second timeout to expire, and panics on any failure.
func startDaemon() {
        ctx := context.Background()
        ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
        defer cancel()

        // Buffered so the start-up goroutine can deliver its error without
        // blocking even if the select below has already given up on timeout.
        errC := make(chan error, 1)

        go func() {
                // Closing errC makes a receive yield nil, which the select below
                // treats as a successful start.
                defer close(errC)

                srvconfig := &config.Config{
                        Version: version.ConfigVersion,
                        Root:    defaultRoot,
                        State:   defaultState,
                        Debug: config.Debug{
                                Level: "debug",
                        },
                        GRPC: config.GRPCConfig{
                                Address:        defaultAddress,
                                MaxRecvMsgSize: defaults.DefaultMaxRecvMsgSize,
                                MaxSendMsgSize: defaults.DefaultMaxSendMsgSize,
                        },
                        DisabledPlugins: []string{},
                        RequiredPlugins: []string{},
                }

                // Note: from here on the local variable `server` shadows the
                // imported server package.
                server, err := server.New(ctx, srvconfig)
                if err != nil {
                        errC <- err
                        return
                }

                l, err := sys.GetLocalListener(srvconfig.GRPC.Address, srvconfig.GRPC.UID, srvconfig.GRPC.GID)
                if err != nil {
                        errC <- err
                        return
                }

                // Serve in a separate goroutine; a serve failure is logged at
                // Fatal level rather than reported through errC.
                go func() {
                        defer l.Close()
                        if err := server.ServeGRPC(l); err != nil {
                                log.G(ctx).WithError(err).WithField("address", srvconfig.GRPC.Address).Fatal("serve failure")
                        }
                }()

                // NOTE(review): presumably blocks until the server is ready; errC
                // is closed only after this returns — confirm against Server.Wait.
                server.Wait()
        }()

        // Wait for the start-up goroutine to finish (nil or an error on errC)
        // or for the timeout, whichever happens first.
        var err error
        select {
        case err = <-errC:
        case <-ctx.Done():
                err = ctx.Err()
        }

        if err != nil {
                panic(err)
        }
}

// Copyright 2021 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package fuzz

import (
        "context"
        _ "crypto/sha256" // required by go-digest
        "os"

        fuzz "github.com/AdaLogics/go-fuzz-headers"
        "github.com/containerd/containerd/v2/core/diff/apply"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/plugins/content/local"
        "github.com/containerd/containerd/v2/plugins/diff/walking"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// FuzzDiffApply builds a list of fuzzer-generated mounts and a descriptor and
// passes them to the Apply method of a file system applier backed by a fresh
// local content store in a temporary directory. It returns 1 once Apply has
// been called and 0 when the inputs or the store could not be set up.
func FuzzDiffApply(data []byte) int {
        f := fuzz.NewConsumer(data)

        mountsQty, err := f.GetInt()
        if err != nil {
                return 0
        }
        // The modulo caps the list at 29 mounts; a negative remainder yields
        // an empty list.
        mounts := make([]mount.Mount, 0)
        for i := 0; i < mountsQty%30; i++ {
                m := mount.Mount{}
                err = f.GenerateStruct(&m)
                if err != nil {
                        return 0
                }
                mounts = append(mounts, m)
        }
        desc := ocispec.Descriptor{}
        err = f.GenerateStruct(&desc)
        if err != nil {
                return 0
        }
        tmpdir, err := os.MkdirTemp("", "fuzzing-")
        if err != nil {
                return 0
        }
        // Remove the scratch store on every exit path; without this each fuzz
        // iteration would leave a directory behind.
        defer os.RemoveAll(tmpdir)

        cs, err := local.NewStore(tmpdir)
        if err != nil {
                return 0
        }
        fsa := apply.NewFileSystemApplier(cs)
        // Only crashes matter to the fuzzer; the result is discarded.
        _, _ = fsa.Apply(context.Background(), desc, mounts)
        return 1
}

// FuzzDiffCompare builds two lists of fuzzer-generated mounts ("lower" and
// "upper") and passes them to the Compare method of a walking differ backed
// by a fresh local content store in a temporary directory. It returns 1 once
// Compare has been called and 0 when the inputs or the store could not be
// set up.
func FuzzDiffCompare(data []byte) int {
        f := fuzz.NewConsumer(data)

        lowerQty, err := f.GetInt()
        if err != nil {
                return 0
        }
        // The modulo caps each list at 29 mounts; a negative remainder yields
        // an empty list.
        lower := make([]mount.Mount, 0)
        for i := 0; i < lowerQty%30; i++ {
                m := mount.Mount{}
                err = f.GenerateStruct(&m)
                if err != nil {
                        return 0
                }
                lower = append(lower, m)
        }

        upperQty, err := f.GetInt()
        if err != nil {
                return 0
        }
        upper := make([]mount.Mount, 0)
        for i := 0; i < upperQty%30; i++ {
                m := mount.Mount{}
                err = f.GenerateStruct(&m)
                if err != nil {
                        return 0
                }
                upper = append(upper, m)
        }

        ctx := context.Background()
        tmpdir, err := os.MkdirTemp("", "fuzzing-")
        if err != nil {
                return 0
        }
        // Remove the scratch store on every exit path; without this each fuzz
        // iteration would leave a directory behind.
        defer os.RemoveAll(tmpdir)

        cs, err := local.NewStore(tmpdir)
        if err != nil {
                return 0
        }
        walker := walking.NewWalkingDiff(cs)
        // Only crashes matter to the fuzzer; the result is discarded.
        _, _ = walker.Compare(ctx, lower, upper)
        return 1
}

// Copyright 2021 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package fuzz

import (
        "context"

        fuzz "github.com/AdaLogics/go-fuzz-headers"
        eventstypes "github.com/containerd/containerd/api/events"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/events/exchange"
        "github.com/containerd/containerd/v2/pkg/namespaces"
)

// FuzzExchange publishes a fuzzer-generated ContainerCreate event and then
// forwards a fuzzer-generated envelope on a fresh exchange, using a context
// that carries a fuzzer-chosen namespace. It returns 1 once both calls have
// been made and 0 when any input could not be generated from data.
func FuzzExchange(data []byte) int {
        consumer := fuzz.NewConsumer(data)

        ns, err := consumer.GetString()
        if err != nil {
                return 0
        }

        created := new(eventstypes.ContainerCreate)
        if err := consumer.GenerateStruct(created); err != nil {
                return 0
        }

        topic, err := consumer.GetString()
        if err != nil {
                return 0
        }

        envelope := new(events.Envelope)
        if err := consumer.GenerateStruct(envelope); err != nil {
                return 0
        }

        nsCtx := namespaces.WithNamespace(context.Background(), ns)
        ex := exchange.NewExchange()
        ex.Publish(nsCtx, topic, created)
        ex.Forward(nsCtx, envelope)
        return 1
}

// Copyright 2021 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package fuzz

import (
        "context"
        "os"

        fuzz "github.com/AdaLogics/go-fuzz-headers"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/plugins/content/local"
        "github.com/containerd/platforms"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// FuzzImagesCheck passes a fuzzer-generated descriptor to images.Check
// together with a fresh local content store in a temporary directory and the
// default platform matcher. It returns 1 once Check has been called and 0
// when the descriptor or the store could not be set up.
func FuzzImagesCheck(data []byte) int {
        f := fuzz.NewConsumer(data)
        desc := ocispec.Descriptor{}
        err := f.GenerateStruct(&desc)
        if err != nil {
                return 0
        }
        tmpdir, err := os.MkdirTemp("", "fuzzing-")
        if err != nil {
                return 0
        }
        // Remove the scratch store on every exit path; without this each fuzz
        // iteration would leave a directory behind.
        defer os.RemoveAll(tmpdir)

        cs, err := local.NewStore(tmpdir)
        if err != nil {
                return 0
        }
        // Only crashes matter to the fuzzer; the results are discarded.
        _, _, _, _, _ = images.Check(context.Background(), cs, desc, platforms.Default())
        return 1
}

//go:build gofuzz

/*
   Copyright The containerd Authors.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package fuzz

import (
        "context"
        "fmt"
        "os"
        "path/filepath"

        fuzz "github.com/AdaLogics/go-fuzz-headers"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        bolt "go.etcd.io/bbolt"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/leases"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/plugins/content/local"
        "github.com/containerd/containerd/v2/plugins/snapshots/native"
)

// testEnv prepares an isolated environment for the metadata fuzzers: a
// cancellable context in the "testing" namespace and a bolt database stored
// as meta.db inside a new temporary directory.
//
// On success the returned function closes the database, removes the
// temporary directory and cancels the context. On failure the database and
// the cleanup function are nil; everything created up to that point has
// already been released and the returned context is already cancelled.
func testEnv() (context.Context, *bolt.DB, func(), error) {
        ctx, cancel := context.WithCancel(context.Background())
        ctx = namespaces.WithNamespace(ctx, "testing")

        dirname, err := os.MkdirTemp("", "fuzz-")
        if err != nil {
                // No cleanup function is handed back on failure, so release the
                // context here.
                cancel()
                return ctx, nil, nil, err
        }

        db, err := bolt.Open(filepath.Join(dirname, "meta.db"), 0644, nil)
        if err != nil {
                // Do not leave the freshly created directory behind.
                _ = os.RemoveAll(dirname)
                cancel()
                return ctx, nil, nil, err
        }

        return ctx, db, func() {
                db.Close()
                _ = os.RemoveAll(dirname)
                cancel()
        }, nil
}

// FuzzImageStore drives a metadata image store through a fuzzer-chosen
// sequence of Create, List, Update and Delete calls, fewer than
// maxOperations in total. It returns 0 when the environment cannot be set up
// or the fuzzer input cannot supply an argument, and 1 once the whole
// sequence has run.
func FuzzImageStore(data []byte) int {
        // Operation selectors; opCount is the number of operations.
        const (
                opCreate = iota
                opList
                opUpdate
                opDelete
                opCount
        )
        const maxOperations = 50

        ctx, db, cleanup, err := testEnv()
        if err != nil {
                return 0
        }
        defer cleanup()

        store := metadata.NewImageStore(metadata.NewDB(db, nil, nil))
        consumer := fuzz.NewConsumer(data)

        total, err := consumer.GetInt()
        if err != nil {
                return 0
        }

        for n := 0; n < total%maxOperations; n++ {
                op, err := consumer.GetInt()
                if err != nil {
                        return 0
                }
                // A negative remainder matches no case, so that round is a no-op.
                switch op % opCount {
                case opCreate:
                        var img images.Image
                        if err := consumer.GenerateStruct(&img); err != nil {
                                return 0
                        }
                        _, _ = store.Create(ctx, img)
                case opList:
                        filter, err := consumer.GetString()
                        if err != nil {
                                return 0
                        }
                        _, _ = store.List(ctx, filter)
                case opUpdate:
                        var img images.Image
                        if err := consumer.GenerateStruct(&img); err != nil {
                                return 0
                        }
                        _, _ = store.Update(ctx, img)
                case opDelete:
                        name, err := consumer.GetString()
                        if err != nil {
                                return 0
                        }
                        _ = store.Delete(ctx, name)
                }
        }
        return 1
}

// FuzzLeaseManager drives a metadata lease manager through a fuzzer-chosen
// sequence of operations (Create, List, AddResource, Delete, DeleteResource,
// ListResources), fewer than maxOperations in total. It returns 0 when the
// environment cannot be set up or the fuzzer input cannot supply an
// argument, and 1 once the whole sequence has run.
func FuzzLeaseManager(data []byte) int {
        // Maps the fuzzer-chosen selector (taken modulo the map size) to an
        // operation name.
        leaseManagerOptions := map[int]string{
                0: "Create",
                1: "List",
                2: "AddResource",
                3: "Delete",
                4: "DeleteResource",
                5: "ListResources",
        }
        ctx, db, cancel, err := testEnv()
        if err != nil {
                return 0
        }
        defer cancel()
        lm := metadata.NewLeaseManager(metadata.NewDB(db, nil, nil))

        f := fuzz.NewConsumer(data)
        noOfOperations, err := f.GetInt()
        if err != nil {
                return 0
        }
        maxOperations := 50
        for i := 0; i < noOfOperations%maxOperations; i++ {
                opType, err := f.GetInt()
                if err != nil {
                        return 0
                }
                // A negative remainder is not a key in the map, so the lookup
                // yields "" and no case runs for that round.
                switch leaseManagerOptions[opType%len(leaseManagerOptions)] {
                case "Create":
                        // NOTE(review): lm.Create receives ctx, not a transaction
                        // context built from tx (contrast with AddResource below), so
                        // the surrounding bolt transaction is not passed to it —
                        // confirm this is intended.
                        err := db.Update(func(tx *bolt.Tx) error {
                                sm := make(map[string]string)
                                err2 := f.FuzzMap(&sm)
                                if err2 != nil {
                                        return err2
                                }
                                _, _ = lm.Create(ctx, leases.WithLabels(sm))
                                return nil
                        })
                        if err != nil {
                                return 0
                        }
                case "List":
                        _, _ = lm.List(ctx)
                case "AddResource":
                        l := leases.Lease{}
                        err := f.GenerateStruct(&l)
                        if err != nil {
                                return 0
                        }
                        r := leases.Resource{}
                        err = f.GenerateStruct(&r)
                        if err != nil {
                                return 0
                        }
                        // Runs inside a bolt write transaction that is handed to the
                        // lease manager through the context. The result of db.Update
                        // is not checked.
                        db.Update(func(tx *bolt.Tx) error {
                                _ = lm.AddResource(metadata.WithTransactionContext(ctx, tx), l, r)
                                return nil
                        })
                case "Delete":
                        l := leases.Lease{}
                        err = f.GenerateStruct(&l)
                        if err != nil {
                                return 0
                        }
                        _ = lm.Delete(ctx, l)
                case "DeleteResource":
                        l := leases.Lease{}
                        err := f.GenerateStruct(&l)
                        if err != nil {
                                return 0
                        }
                        r := leases.Resource{}
                        err = f.GenerateStruct(&r)
                        if err != nil {
                                return 0
                        }
                        _ = lm.DeleteResource(ctx, l, r)
                case "ListResources":
                        l := leases.Lease{}
                        err := f.GenerateStruct(&l)
                        if err != nil {
                                return 0
                        }
                        _, _ = lm.ListResources(ctx, l)
                }
        }
        return 1
}

// FuzzContainerStore drives a metadata container store through a
// fuzzer-chosen sequence of Create, List, Delete, Update and Get calls, fewer
// than maxOperations in total. A single container value is shared across
// rounds: Create regenerates it from the fuzzer input and Update passes
// whatever it currently holds. The function returns 0 when the environment
// cannot be set up or the fuzzer input cannot supply an argument, and 1 once
// the whole sequence has run.
func FuzzContainerStore(data []byte) int {
        // Operation selectors; opCount is the number of operations.
        const (
                opCreate = iota
                opList
                opDelete
                opUpdate
                opGet
                opCount
        )
        const maxOperations = 50

        ctx, db, cleanup, err := testEnv()
        if err != nil {
                return 0
        }
        defer cleanup()

        store := metadata.NewContainerStore(metadata.NewDB(db, nil, nil))
        var current containers.Container
        consumer := fuzz.NewConsumer(data)

        total, err := consumer.GetInt()
        if err != nil {
                return 0
        }

        for n := 0; n < total%maxOperations; n++ {
                op, err := consumer.GetInt()
                if err != nil {
                        return 0
                }
                // A negative remainder matches no case, so that round is a no-op.
                switch op % opCount {
                case opCreate:
                        if err := consumer.GenerateStruct(&current); err != nil {
                                return 0
                        }
                        // Create runs inside a bolt write transaction handed to the
                        // store through the context.
                        db.Update(func(tx *bolt.Tx) error {
                                _, _ = store.Create(metadata.WithTransactionContext(ctx, tx), current)
                                return nil
                        })
                case opList:
                        filter, err := consumer.GetString()
                        if err != nil {
                                return 0
                        }
                        _, _ = store.List(ctx, filter)
                case opDelete:
                        id, err := consumer.GetString()
                        if err != nil {
                                return 0
                        }
                        _ = store.Delete(ctx, id)
                case opUpdate:
                        fieldpaths, err := consumer.GetString()
                        if err != nil {
                                return 0
                        }
                        _, _ = store.Update(ctx, current, fieldpaths)
                case opGet:
                        id, err := consumer.GetString()
                        if err != nil {
                                return 0
                        }
                        _, _ = store.Get(ctx, id)
                }
        }
        return 1
}

type (
        // testOptions collects the optional settings accepted by testDB.
        testOptions struct {
                // extraSnapshots maps a snapshotter name to a constructor that is
                // given a directory path and returns the snapshotter to register
                // under that name.
                extraSnapshots map[string]func(string) (snapshots.Snapshotter, error)
        }

        // testOpt mutates a testOptions value.
        testOpt func(*testOptions)
)

// testDB builds a metadata.DB for fuzzing inside a new temporary directory.
// The DB is backed by a bolt database (metadata.db), a local content store
// (content) and a native snapshotter (native), plus any snapshotters
// requested through opt. The returned context is cancellable and carries the
// "testing" namespace.
//
// The returned function is never nil, so callers may defer it before
// checking the error. On success it closes the bolt database, removes the
// temporary directory and cancels the context. On failure the DB is nil, the
// temporary directory has already been removed and the function only cancels
// the context.
func testDB(opt ...testOpt) (context.Context, *metadata.DB, func(), error) {
        ctx, cancel := context.WithCancel(context.Background())
        ctx = namespaces.WithNamespace(ctx, "testing")

        var topts testOptions

        for _, o := range opt {
                o(&topts)
        }

        dirname, err := os.MkdirTemp("", "fuzzing-")
        if err != nil {
                return ctx, nil, func() { cancel() }, err
        }

        // abort removes the temporary directory and reports err. The directory
        // must NOT be removed with a plain defer: that would delete it as this
        // function returns, while the returned DB still depends on it.
        abort := func(err error) (context.Context, *metadata.DB, func(), error) {
                _ = os.RemoveAll(dirname)
                return ctx, nil, func() { cancel() }, err
        }

        snapshotter, err := native.NewSnapshotter(filepath.Join(dirname, "native"))
        if err != nil {
                return abort(err)
        }

        snapshotters := map[string]snapshots.Snapshotter{
                "native": snapshotter,
        }

        for name, fn := range topts.extraSnapshots {
                snapshotter, err := fn(filepath.Join(dirname, name))
                if err != nil {
                        return abort(err)
                }
                snapshotters[name] = snapshotter
        }

        cs, err := local.NewStore(filepath.Join(dirname, "content"))
        if err != nil {
                return abort(err)
        }

        bdb, err := bolt.Open(filepath.Join(dirname, "metadata.db"), 0644, nil)
        if err != nil {
                return abort(err)
        }

        db := metadata.NewDB(bdb, cs, snapshotters)
        if err := db.Init(ctx); err != nil {
                // Release the bolt handle before deleting its file.
                bdb.Close()
                return abort(err)
        }

        return ctx, db, func() {
                bdb.Close()
                if err := os.RemoveAll(dirname); err != nil {
                        fmt.Println("Failed removing temp dir")
                }
                cancel()
        }, nil
}

// FuzzContentStore drives the content store of a metadata DB through a
// fuzzer-chosen sequence of operations (Info, Update, Walk, Delete,
// ListStatuses, Status, Abort, Commit), fewer than maxOperations in total.
// It returns 0 when the environment cannot be set up, when the fuzzer input
// cannot supply an argument or when a writer cannot be opened, and 1 once
// the whole sequence has run.
func FuzzContentStore(data []byte) int {
        // Maps the fuzzer-chosen selector (taken modulo the map size) to an
        // operation name.
        contentStoreOptions := map[int]string{
                0: "Info",
                1: "Update",
                2: "Walk",
                3: "Delete",
                4: "ListStatuses",
                5: "Status",
                6: "Abort",
                7: "Commit",
        }
        ctx, db, cancel, err := testDB()
        // testDB returns a usable cleanup function even on failure, so it is
        // deferred before the error check.
        defer cancel()
        if err != nil {
                return 0
        }

        cs := db.ContentStore()
        f := fuzz.NewConsumer(data)
        noOfOperations, err := f.GetInt()
        if err != nil {
                return 0
        }
        maxOperations := 50
        for i := 0; i < noOfOperations%maxOperations; i++ {
                opType, err := f.GetInt()
                if err != nil {
                        return 0
                }
                // A negative remainder is not a key in the map, so the lookup
                // yields "" and no case runs for that round.
                switch contentStoreOptions[opType%len(contentStoreOptions)] {
                case "Info":
                        blob, err := f.GetBytes()
                        if err != nil {
                                return 0
                        }
                        dgst := digest.FromBytes(blob)
                        err = dgst.Validate()
                        if err != nil {
                                return 0
                        }
                        _, _ = cs.Info(ctx, dgst)
                case "Update":
                        info := content.Info{}
                        err = f.GenerateStruct(&info)
                        if err != nil {
                                return 0
                        }
                        _, _ = cs.Update(ctx, info)
                case "Walk":
                        walkFn := func(info content.Info) error {
                                return nil
                        }
                        _ = cs.Walk(ctx, walkFn)
                case "Delete":
                        blob, err := f.GetBytes()
                        if err != nil {
                                return 0
                        }
                        dgst := digest.FromBytes(blob)
                        err = dgst.Validate()
                        if err != nil {
                                return 0
                        }
                        _ = cs.Delete(ctx, dgst)
                case "ListStatuses":
                        _, _ = cs.ListStatuses(ctx)
                case "Status":
                        ref, err := f.GetString()
                        if err != nil {
                                return 0
                        }
                        _, _ = cs.Status(ctx, ref)
                case "Abort":
                        ref, err := f.GetString()
                        if err != nil {
                                return 0
                        }
                        _ = cs.Abort(ctx, ref)
                case "Commit":
                        desc := ocispec.Descriptor{}
                        err = f.GenerateStruct(&desc)
                        if err != nil {
                                return 0
                        }
                        ref, err := f.GetString()
                        if err != nil {
                                return 0
                        }
                        csWriter, err := cs.Writer(ctx,
                                content.WithDescriptor(desc),
                                content.WithRef(ref))
                        if err != nil {
                                return 0
                        }
                        p, err := f.GetBytes()
                        if err != nil {
                                csWriter.Close()
                                return 0
                        }
                        _, _ = csWriter.Write(p)
                        _ = csWriter.Commit(ctx, 0, csWriter.Digest())
                        // Close the writer at the end of this round. A defer here
                        // would only fire when the function returns, keeping every
                        // writer opened by the loop alive until then.
                        csWriter.Close()
                }
        }
        return 1
}

// Copyright 2021 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package fuzz

import (
        "github.com/google/uuid"
)

// FuzzUUIDParse hands the raw fuzzer input, converted to a string, to
// uuid.Parse. The parse result is discarded and the function always
// returns 1.
func FuzzUUIDParse(data []byte) int {
        input := string(data)
        _, _ = uuid.Parse(input)
        return 1
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package seccomp

import (
        "context"
        "encoding/json"
        "fmt"
        "os"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/opencontainers/runtime-spec/specs-go"
)

// WithProfile receives the name of a file stored on disk comprising a json
// formatted seccomp profile, as specified by the opencontainers/runtime-spec.
// The profile is read from the file, unmarshaled, and set to the spec.
//
// The returned option allocates s.Linux when it is nil. It returns an error
// wrapping the underlying cause when the file cannot be read or its contents
// cannot be decoded. As before, s.Linux.Seccomp is replaced with a fresh
// value before the file is read, so it is left empty or partially decoded
// when an error is returned.
//
// FIXME: pkg/cri/[sb]server/container_create_linux_test.go depends on go:noinline
// since Go 1.21.
//
//go:noinline
func WithProfile(profile string) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error {
                // Guard against specs without a Linux section instead of panicking.
                if s.Linux == nil {
                        s.Linux = &specs.Linux{}
                }
                s.Linux.Seccomp = &specs.LinuxSeccomp{}
                f, err := os.ReadFile(profile)
                if err != nil {
                        // %w keeps the cause inspectable with errors.Is / errors.As.
                        return fmt.Errorf("cannot load seccomp profile %q: %w", profile, err)
                }
                if err := json.Unmarshal(f, s.Linux.Seccomp); err != nil {
                        return fmt.Errorf("decoding seccomp profile failed %q: %w", profile, err)
                }
                return nil
        }
}

// WithDefaultProfile returns a spec option that stores the result of
// DefaultProfile(s) in s.Linux.Seccomp and always reports success.
// Note: it must be applied after the process capabilities have been set.
//
// FIXME: pkg/cri/[sb]server/container_create_linux_test.go depends on go:noinline
// since Go 1.21.
//
//go:noinline
func WithDefaultProfile() oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error {
                defaults := DefaultProfile(s)
                s.Linux.Seccomp = defaults
                return nil
        }
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package seccomp

import (
        "runtime"

        "golang.org/x/sys/unix"

        "github.com/containerd/containerd/v2/pkg/kernelversion"
        "github.com/opencontainers/runtime-spec/specs-go"
)

// arches returns the seccomp architectures to enable for the architecture this
// binary was built for (runtime.GOARCH): the native architecture together with
// the additional ABIs listed for it. Architectures without an entry yield an
// empty, non-nil slice.
func arches() []specs.Arch {
        switch runtime.GOARCH {
        case "amd64":
                return []specs.Arch{specs.ArchX86_64, specs.ArchX86, specs.ArchX32}
        case "arm64":
                return []specs.Arch{specs.ArchARM, specs.ArchAARCH64}
        case "mips64", "mips64n32":
                return []specs.Arch{specs.ArchMIPS, specs.ArchMIPS64, specs.ArchMIPS64N32}
        case "mips64le", "mipsel64", "mipsel64n32":
                // Go names little-endian MIPS64 "mips64le". The "mipsel64*"
                // spellings are not GOARCH values Go produces; they are kept so
                // that nothing previously matched stops matching.
                return []specs.Arch{specs.ArchMIPSEL, specs.ArchMIPSEL64, specs.ArchMIPSEL64N32}
        case "s390x":
                return []specs.Arch{specs.ArchS390, specs.ArchS390X}
        case "riscv64":
                // ArchRISCV32 (SCMP_ARCH_RISCV32) does not exist
                return []specs.Arch{specs.ArchRISCV64}
        default:
                return []specs.Arch{}
        }
}

// DefaultProfile defines the allowed syscalls for the default seccomp profile.
//
// The returned profile uses specs.ActErrno as its default action and the
// architectures reported by arches(). Its syscall rules are appended in this
// order:
//   - an unconditional allow-list, followed by argument-filtered rules for
//     "socket" and "personality";
//   - syscalls gated on the running kernel version;
//   - syscalls specific to runtime.GOARCH;
//   - syscalls unlocked by each capability found in
//     sp.Process.Capabilities.Bounding;
//   - when CAP_SYS_ADMIN is not in the bounding set, an argument-filtered
//     "clone" rule and a "clone3" rule that returns ENOSYS.
//
// sp.Process.Capabilities.Bounding is read without any nil checks, so the
// spec's process capabilities must already be populated by the caller.
func DefaultProfile(sp *specs.Spec) *specs.LinuxSeccomp {
        // Errno returned for clone3 when CAP_SYS_ADMIN is absent (see the end of
        // this function).
        nosys := uint(unix.ENOSYS)
        syscalls := []specs.LinuxSyscall{
                // Syscalls allowed unconditionally, with no argument filtering.
                {
                        Names: []string{
                                "accept",
                                "accept4",
                                "access",
                                "adjtimex",
                                "alarm",
                                "bind",
                                "brk",
                                "cachestat", // kernel v6.5, libseccomp v2.5.5
                                "capget",
                                "capset",
                                "chdir",
                                "chmod",
                                "chown",
                                "chown32",
                                "clock_adjtime",
                                "clock_adjtime64",
                                "clock_getres",
                                "clock_getres_time64",
                                "clock_gettime",
                                "clock_gettime64",
                                "clock_nanosleep",
                                "clock_nanosleep_time64",
                                "close",
                                "close_range",
                                "connect",
                                "copy_file_range",
                                "creat",
                                "dup",
                                "dup2",
                                "dup3",
                                "epoll_create",
                                "epoll_create1",
                                "epoll_ctl",
                                "epoll_ctl_old",
                                "epoll_pwait",
                                "epoll_pwait2",
                                "epoll_wait",
                                "epoll_wait_old",
                                "eventfd",
                                "eventfd2",
                                "execve",
                                "execveat",
                                "exit",
                                "exit_group",
                                "faccessat",
                                "faccessat2",
                                "fadvise64",
                                "fadvise64_64",
                                "fallocate",
                                "fanotify_mark",
                                "fchdir",
                                "fchmod",
                                "fchmodat",
                                "fchmodat2", // kernel v6.6, libseccomp v2.5.5
                                "fchown",
                                "fchown32",
                                "fchownat",
                                "fcntl",
                                "fcntl64",
                                "fdatasync",
                                "fgetxattr",
                                "flistxattr",
                                "flock",
                                "fork",
                                "fremovexattr",
                                "fsetxattr",
                                "fstat",
                                "fstat64",
                                "fstatat64",
                                "fstatfs",
                                "fstatfs64",
                                "fsync",
                                "ftruncate",
                                "ftruncate64",
                                "futex",
                                "futex_requeue", // kernel v6.7, libseccomp v2.5.5
                                "futex_time64",
                                "futex_wait", // kernel v6.7, libseccomp v2.5.5
                                "futex_waitv",
                                "futex_wake", // kernel v6.7, libseccomp v2.5.5
                                "futimesat",
                                "getcpu",
                                "getcwd",
                                "getdents",
                                "getdents64",
                                "getegid",
                                "getegid32",
                                "geteuid",
                                "geteuid32",
                                "getgid",
                                "getgid32",
                                "getgroups",
                                "getgroups32",
                                "getitimer",
                                "getpeername",
                                "getpgid",
                                "getpgrp",
                                "getpid",
                                "getppid",
                                "getpriority",
                                "getrandom",
                                "getresgid",
                                "getresgid32",
                                "getresuid",
                                "getresuid32",
                                "getrlimit",
                                "get_robust_list",
                                "getrusage",
                                "getsid",
                                "getsockname",
                                "getsockopt",
                                "get_thread_area",
                                "gettid",
                                "gettimeofday",
                                "getuid",
                                "getuid32",
                                "getxattr",
                                "inotify_add_watch",
                                "inotify_init",
                                "inotify_init1",
                                "inotify_rm_watch",
                                "io_cancel",
                                "ioctl",
                                "io_destroy",
                                "io_getevents",
                                "io_pgetevents",
                                "io_pgetevents_time64",
                                "ioprio_get",
                                "ioprio_set",
                                "io_setup",
                                "io_submit",
                                "ipc",
                                "kill",
                                "landlock_add_rule",
                                "landlock_create_ruleset",
                                "landlock_restrict_self",
                                "lchown",
                                "lchown32",
                                "lgetxattr",
                                "link",
                                "linkat",
                                "listen",
                                "listxattr",
                                "llistxattr",
                                "_llseek",
                                "lremovexattr",
                                "lseek",
                                "lsetxattr",
                                "lstat",
                                "lstat64",
                                "madvise",
                                "membarrier",
                                "memfd_create",
                                "memfd_secret",
                                "mincore",
                                "mkdir",
                                "mkdirat",
                                "mknod",
                                "mknodat",
                                "mlock",
                                "mlock2",
                                "mlockall",
                                "map_shadow_stack", // kernel v6.6, libseccomp v2.5.5
                                "mmap",
                                "mmap2",
                                "mprotect",
                                "mq_getsetattr",
                                "mq_notify",
                                "mq_open",
                                "mq_timedreceive",
                                "mq_timedreceive_time64",
                                "mq_timedsend",
                                "mq_timedsend_time64",
                                "mq_unlink",
                                "mremap",
                                "msgctl",
                                "msgget",
                                "msgrcv",
                                "msgsnd",
                                "msync",
                                "munlock",
                                "munlockall",
                                "munmap",
                                "name_to_handle_at",
                                "nanosleep",
                                "newfstatat",
                                "_newselect",
                                "open",
                                "openat",
                                "openat2",
                                "pause",
                                "pidfd_open",
                                "pidfd_send_signal",
                                "pipe",
                                "pipe2",
                                "pkey_alloc",
                                "pkey_free",
                                "pkey_mprotect",
                                "poll",
                                "ppoll",
                                "ppoll_time64",
                                "prctl",
                                "pread64",
                                "preadv",
                                "preadv2",
                                "prlimit64",
                                "process_mrelease",
                                "pselect6",
                                "pselect6_time64",
                                "pwrite64",
                                "pwritev",
                                "pwritev2",
                                "read",
                                "readahead",
                                "readlink",
                                "readlinkat",
                                "readv",
                                "recv",
                                "recvfrom",
                                "recvmmsg",
                                "recvmmsg_time64",
                                "recvmsg",
                                "remap_file_pages",
                                "removexattr",
                                "rename",
                                "renameat",
                                "renameat2",
                                "restart_syscall",
                                "rmdir",
                                "rseq",
                                "rt_sigaction",
                                "rt_sigpending",
                                "rt_sigprocmask",
                                "rt_sigqueueinfo",
                                "rt_sigreturn",
                                "rt_sigsuspend",
                                "rt_sigtimedwait",
                                "rt_sigtimedwait_time64",
                                "rt_tgsigqueueinfo",
                                "sched_getaffinity",
                                "sched_getattr",
                                "sched_getparam",
                                "sched_get_priority_max",
                                "sched_get_priority_min",
                                "sched_getscheduler",
                                "sched_rr_get_interval",
                                "sched_rr_get_interval_time64",
                                "sched_setaffinity",
                                "sched_setattr",
                                "sched_setparam",
                                "sched_setscheduler",
                                "sched_yield",
                                "seccomp",
                                "select",
                                "semctl",
                                "semget",
                                "semop",
                                "semtimedop",
                                "semtimedop_time64",
                                "send",
                                "sendfile",
                                "sendfile64",
                                "sendmmsg",
                                "sendmsg",
                                "sendto",
                                "setfsgid",
                                "setfsgid32",
                                "setfsuid",
                                "setfsuid32",
                                "setgid",
                                "setgid32",
                                "setgroups",
                                "setgroups32",
                                "setitimer",
                                "setpgid",
                                "setpriority",
                                "setregid",
                                "setregid32",
                                "setresgid",
                                "setresgid32",
                                "setresuid",
                                "setresuid32",
                                "setreuid",
                                "setreuid32",
                                "setrlimit",
                                "set_robust_list",
                                "setsid",
                                "setsockopt",
                                "set_thread_area",
                                "set_tid_address",
                                "setuid",
                                "setuid32",
                                "setxattr",
                                "shmat",
                                "shmctl",
                                "shmdt",
                                "shmget",
                                "shutdown",
                                "sigaltstack",
                                "signalfd",
                                "signalfd4",
                                "sigprocmask",
                                "sigreturn",
                                "socketcall",
                                "socketpair",
                                "splice",
                                "stat",
                                "stat64",
                                "statfs",
                                "statfs64",
                                "statx",
                                "symlink",
                                "symlinkat",
                                "sync",
                                "sync_file_range",
                                "syncfs",
                                "sysinfo",
                                "tee",
                                "tgkill",
                                "time",
                                "timer_create",
                                "timer_delete",
                                "timer_getoverrun",
                                "timer_gettime",
                                "timer_gettime64",
                                "timer_settime",
                                "timer_settime64",
                                "timerfd_create",
                                "timerfd_gettime",
                                "timerfd_gettime64",
                                "timerfd_settime",
                                "timerfd_settime64",
                                "times",
                                "tkill",
                                "truncate",
                                "truncate64",
                                "ugetrlimit",
                                "umask",
                                "uname",
                                "unlink",
                                "unlinkat",
                                "utime",
                                "utimensat",
                                "utimensat_time64",
                                "utimes",
                                "vfork",
                                "vmsplice",
                                "wait4",
                                "waitid",
                                "waitpid",
                                "write",
                                "writev",
                        },
                        Action: specs.ActAllow,
                        Args:   []specs.LinuxSeccompArg{},
                },
                // socket is allowed as long as its first argument is not AF_VSOCK.
                {
                        Names:  []string{"socket"},
                        Action: specs.ActAllow,
                        Args: []specs.LinuxSeccompArg{
                                {
                                        Index: 0,
                                        Value: unix.AF_VSOCK,
                                        Op:    specs.OpNotEqual,
                                },
                        },
                },
                // personality is allowed only when its first argument equals one of
                // the exact values below; each value gets its own rule.
                {
                        Names:  []string{"personality"},
                        Action: specs.ActAllow,
                        Args: []specs.LinuxSeccompArg{
                                {
                                        Index: 0,
                                        Value: 0x0,
                                        Op:    specs.OpEqualTo,
                                },
                        },
                },
                {
                        Names:  []string{"personality"},
                        Action: specs.ActAllow,
                        Args: []specs.LinuxSeccompArg{
                                {
                                        Index: 0,
                                        Value: 0x0008,
                                        Op:    specs.OpEqualTo,
                                },
                        },
                },
                {
                        Names:  []string{"personality"},
                        Action: specs.ActAllow,
                        Args: []specs.LinuxSeccompArg{
                                {
                                        Index: 0,
                                        Value: 0x20000,
                                        Op:    specs.OpEqualTo,
                                },
                        },
                },
                {
                        Names:  []string{"personality"},
                        Action: specs.ActAllow,
                        Args: []specs.LinuxSeccompArg{
                                {
                                        Index: 0,
                                        Value: 0x20008,
                                        Op:    specs.OpEqualTo,
                                },
                        },
                },
                {
                        Names:  []string{"personality"},
                        Action: specs.ActAllow,
                        Args: []specs.LinuxSeccompArg{
                                {
                                        Index: 0,
                                        Value: 0xffffffff,
                                        Op:    specs.OpEqualTo,
                                },
                        },
                },
        }

        // Anything not matched by a rule gets the errno action.
        s := &specs.LinuxSeccomp{
                DefaultAction: specs.ActErrno,
                Architectures: arches(),
                Syscalls:      syscalls,
        }

        // include by kernel version
        // The extra syscalls are added only when the version check succeeds and
        // reports at least KernelVersion{Kernel: 4, Major: 8}; a lookup error is
        // silently treated as "do not add". The same names are also listed under
        // CAP_SYS_PTRACE below, so they may appear in two rules.
        if ok, err := kernelversion.GreaterEqualThan(
                kernelversion.KernelVersion{Kernel: 4, Major: 8}); err == nil {
                if ok {
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names: []string{
                                        "process_vm_readv",
                                        "process_vm_writev",
                                        "ptrace",
                                },
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                }
        }

        // include by arch
        // Architectures without a case here get no additional syscalls.
        switch runtime.GOARCH {
        case "ppc64le":
                s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                        Names: []string{
                                "sync_file_range2",
                                "swapcontext",
                        },
                        Action: specs.ActAllow,
                        Args:   []specs.LinuxSeccompArg{},
                })
        case "arm", "arm64":
                s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                        Names: []string{
                                "arm_fadvise64_64",
                                "arm_sync_file_range",
                                "sync_file_range2",
                                "breakpoint",
                                "cacheflush",
                                "set_tls",
                        },
                        Action: specs.ActAllow,
                        Args:   []specs.LinuxSeccompArg{},
                })
        case "amd64":
                s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                        Names: []string{
                                "arch_prctl",
                                "modify_ldt",
                        },
                        Action: specs.ActAllow,
                        Args:   []specs.LinuxSeccompArg{},
                })
        case "386":
                s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                        Names: []string{
                                "modify_ldt",
                        },
                        Action: specs.ActAllow,
                        Args:   []specs.LinuxSeccompArg{},
                })
        case "s390", "s390x":
                s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                        Names: []string{
                                "s390_pci_mmio_read",
                                "s390_pci_mmio_write",
                                "s390_runtime_instr",
                        },
                        Action: specs.ActAllow,
                        Args:   []specs.LinuxSeccompArg{},
                })
        case "riscv64":
                s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                        Names: []string{
                                "riscv_flush_icache",
                        },
                        Action: specs.ActAllow,
                        Args:   []specs.LinuxSeccompArg{},
                })
        }

        // Include by capability: each recognized capability in the bounding set
        // appends its own allow rule. admin records whether CAP_SYS_ADMIN was
        // seen, which decides how clone/clone3 are handled afterwards.
        admin := false
        for _, c := range sp.Process.Capabilities.Bounding {
                switch c {
                case "CAP_DAC_READ_SEARCH":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names:  []string{"open_by_handle_at"},
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_SYS_ADMIN":
                        admin = true
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names: []string{
                                        "bpf",
                                        "clone",
                                        "clone3",
                                        "fanotify_init",
                                        "fsconfig",
                                        "fsmount",
                                        "fsopen",
                                        "fspick",
                                        "lookup_dcookie",
                                        "mount",
                                        "mount_setattr",
                                        "move_mount",
                                        "open_tree",
                                        "perf_event_open",
                                        "quotactl",
                                        "quotactl_fd",
                                        "setdomainname",
                                        "sethostname",
                                        "setns",
                                        "syslog",
                                        "umount",
                                        "umount2",
                                        "unshare",
                                },
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_SYS_BOOT":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names:  []string{"reboot"},
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_SYS_CHROOT":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names:  []string{"chroot"},
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_SYS_MODULE":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names: []string{
                                        "delete_module",
                                        "init_module",
                                        "finit_module",
                                },
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_SYS_PACCT":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names:  []string{"acct"},
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_SYS_PTRACE":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names: []string{
                                        "kcmp",
                                        "pidfd_getfd",
                                        "process_madvise",
                                        "process_vm_readv",
                                        "process_vm_writev",
                                        "ptrace",
                                },
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_SYS_RAWIO":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names: []string{
                                        "iopl",
                                        "ioperm",
                                },
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_SYS_TIME":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names: []string{
                                        "settimeofday",
                                        "stime",
                                        "clock_settime",
                                        "clock_settime64",
                                },
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_SYS_TTY_CONFIG":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names:  []string{"vhangup"},
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_SYS_NICE":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names: []string{
                                        "get_mempolicy",
                                        "mbind",
                                        "set_mempolicy",
                                        "set_mempolicy_home_node", // kernel v5.17, libseccomp v2.5.4
                                },
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_SYSLOG":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names:  []string{"syslog"},
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_BPF":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names:  []string{"bpf"},
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                case "CAP_PERFMON":
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names:  []string{"perf_event_open"},
                                Action: specs.ActAllow,
                                Args:   []specs.LinuxSeccompArg{},
                        })
                }
        }

        // Without CAP_SYS_ADMIN, clone is not allowed outright. Instead it gets a
        // masked comparison against the namespace-creation flags with an expected
        // value of 0.
        // NOTE(review): presumably this matches only when none of the CLONE_NEW*
        // flags are set in the argument — confirm against the runtime-spec
        // definition of OpMaskedEqual.
        if !admin {
                switch runtime.GOARCH {
                case "s390", "s390x":
                        // Same rule as the default case, but the flags are checked in
                        // argument 1 instead of argument 0.
                        // NOTE(review): presumably because the s390 clone ABI orders
                        // its arguments differently — confirm.
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names: []string{
                                        "clone",
                                },
                                Action: specs.ActAllow,
                                Args: []specs.LinuxSeccompArg{
                                        {
                                                Index:    1,
                                                Value:    unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
                                                ValueTwo: 0,
                                                Op:       specs.OpMaskedEqual,
                                        },
                                },
                        })
                default:
                        s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                                Names: []string{
                                        "clone",
                                },
                                Action: specs.ActAllow,
                                Args: []specs.LinuxSeccompArg{
                                        {
                                                Index:    0,
                                                Value:    unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
                                                ValueTwo: 0,
                                                Op:       specs.OpMaskedEqual,
                                        },
                                },
                        })
                }
                // clone3 is explicitly requested to give ENOSYS instead of the default EPERM, when CAP_SYS_ADMIN is unset
                // https://github.com/moby/moby/pull/42681
                s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
                        Names: []string{
                                "clone3",
                        },
                        Action:   specs.ActErrno,
                        ErrnoRet: &nosys,
                })
        }

        return s
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package content

import (
        "strings"

        "github.com/containerd/containerd/v2/pkg/filters"
)

// AdaptInfo wraps a `content.Info` in a `filters.Adaptor` so it can be
// matched against filter expressions.
//
// The "digest" field resolves to the string form of info.Digest and
// "labels.<key>" resolves through info.Labels. "size" is recognized but not
// yet filterable, and every other field path reports no value.
func AdaptInfo(info Info) filters.Adaptor {
        lookup := func(fieldpath []string) (string, bool) {
                if len(fieldpath) < 1 {
                        return "", false
                }

                field, rest := fieldpath[0], fieldpath[1:]
                if field == "digest" {
                        return info.Digest.String(), true
                }
                if field == "labels" {
                        return checkMap(rest, info.Labels)
                }

                // "size" lands here as well.
                // TODO: support size based filtering
                return "", false
        }

        return filters.AdapterFunc(lookup)
}

// checkMap looks up the dot-joined fieldpath as a single key in m. It
// returns the stored value and true when the key is present, and "" and
// false when the map is empty or the key is missing.
func checkMap(fieldpath []string, m map[string]string) (string, bool) {
        if len(m) > 0 {
                key := strings.Join(fieldpath, ".")
                if v, found := m[key]; found {
                        return v, true
                }
        }
        return "", false
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package content

import (
        "context"
        "io"
        "time"

        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// Store combines the methods of content-oriented interfaces into a set that
// are commonly provided by complete implementations.
//
// Overall content lifecycle:
//   - Ingester is used to initiate a write operation (aka ingestion)
//   - IngestManager is used to manage (e.g. list, abort) active ingestions
//   - Once an ingestion is complete (see Writer.Commit), Provider is used to
//     query a single piece of content by its digest
//   - Manager is used to manage (e.g. list, delete) previously committed content
//
// Note that until ingestion is complete, its content is not visible through
// Provider or Manager. Once ingestion is complete, it is no longer exposed
// through IngestManager.
type Store interface {
        // Manager inspects, updates, walks and deletes committed content.
        Manager
        // Provider opens committed content for reading.
        Provider
        // IngestManager reports on and aborts active ingestions.
        IngestManager
        // Ingester opens writers that start or resume an ingestion.
        Ingester
}

// ReaderAt extends the standard io.ReaderAt interface with io.Closer and
// with reporting of the content size.
type ReaderAt interface {
        // ReadAt gives random access to the content.
        io.ReaderAt
        // Close is called by consumers once they are done with the reader.
        io.Closer
        // Size returns the length of the content in bytes; helpers use it as
        // the length of the io.SectionReader they build over the ReaderAt.
        Size() int64
}

// Provider provides a reader interface for specific content.
type Provider interface {
        // ReaderAt returns a ReaderAt for the content described by desc.
        //
        // ReaderAt only requires desc.Digest to be set.
        // Other fields in the descriptor may be used internally for resolving
        // the location of the actual data.
        ReaderAt(ctx context.Context, desc ocispec.Descriptor) (ReaderAt, error)
}

// Ingester writes content by opening writers for ingestions.
type Ingester interface {
        // Writer initiates a writing operation (aka ingestion). A single ingestion
        // is uniquely identified by its ref, provided using a WithRef option.
        // Writer can be called multiple times with the same ref to access the same
        // ingestion.
        // Once all the data is written, use Writer.Commit to complete the ingestion.
        Writer(ctx context.Context, opts ...WriterOpt) (Writer, error)
}

// IngestManager provides methods for managing ingestions. An ingestion is a
// not-yet-complete writing operation initiated using Ingester and identified
// by a ref string.
type IngestManager interface {
        // Status returns the status of the provided ref.
        Status(ctx context.Context, ref string) (Status, error)

        // ListStatuses returns the status of any active ingestions whose ref match
        // the provided regular expression. If empty, all active ingestions will be
        // returned.
        //
        // NOTE(review): the parameter is a list of filter strings rather than a
        // single regular expression; confirm the matching semantics against the
        // implementations.
        ListStatuses(ctx context.Context, filters ...string) ([]Status, error)

        // Abort completely cancels the ingest operation targeted by ref.
        Abort(ctx context.Context, ref string) error
}

// Info holds content specific information
type Info struct {
        // Digest identifies the content; it is the key used by
        // InfoProvider.Info and Manager.Delete.
        Digest digest.Digest
        // Size is the size of the content (presumably in bytes — confirm
        // against the store implementation).
        Size int64
        // CreatedAt and UpdatedAt are timestamps maintained for the content.
        CreatedAt time.Time
        UpdatedAt time.Time
        // Labels are the mutable key/value labels of the content; see
        // Manager.Update and WithLabels.
        Labels map[string]string
}

// Status of a content operation (i.e. an ingestion)
type Status struct {
        // Ref is the ref string identifying the ingestion.
        Ref string
        // Offset is the position the ingestion has reached; Copy uses it to
        // decide where to resume reading from the source.
        Offset int64
        // Total is the expected total size of the ingestion
        // (NOTE(review): presumably 0 when unknown — confirm).
        Total int64
        // Expected is the digest the ingestion is expected to produce.
        Expected digest.Digest
        // StartedAt and UpdatedAt are timestamps of the ingestion.
        StartedAt time.Time
        UpdatedAt time.Time
}

// WalkFunc defines the callback for a blob walk. Manager.Walk calls it once
// for each matching item, passing that item's Info.
type WalkFunc func(Info) error

// InfoReaderProvider provides both info and reader for the specific content.
type InfoReaderProvider interface {
        // InfoProvider looks up content metadata by digest.
        InfoProvider
        // Provider opens the content for reading.
        Provider
}

// InfoProvider provides info for content inspection.
type InfoProvider interface {
        // Info will return metadata about content available in the content store.
        // The content is looked up by its digest.
        //
        // If the content is not present, ErrNotFound will be returned.
        Info(ctx context.Context, dgst digest.Digest) (Info, error)
}

// Manager provides methods for inspecting, listing and removing content.
type Manager interface {
        // InfoProvider supplies the Info lookup by digest.
        InfoProvider

        // Update updates mutable information related to content.
        // If one or more fieldpaths are provided, only those
        // fields will be updated.
        // Mutable fields:
        //  labels.*
        //
        // The updated Info is returned.
        Update(ctx context.Context, info Info, fieldpaths ...string) (Info, error)

        // Walk will call fn for each item in the content store which
        // match the provided filters. If no filters are given all
        // items will be walked.
        Walk(ctx context.Context, fn WalkFunc, filters ...string) error

        // Delete removes the content identified by dgst from the store.
        Delete(ctx context.Context, dgst digest.Digest) error
}

// Writer handles writing of content into a content store.
type Writer interface {
        // Close closes the writer. If the writer has not been
        // committed, this allows resuming or aborting.
        // Calling Close on a closed writer will not error.
        io.WriteCloser

        // Digest may return an empty digest or panic until committed.
        Digest() digest.Digest

        // Commit commits the blob (but no roll-back is guaranteed on an error).
        // size and expected can be zero-value when unknown.
        // Commit always closes the writer, even on error.
        // ErrAlreadyExists aborts the writer.
        Commit(ctx context.Context, size int64, expected digest.Digest, opts ...Opt) error

        // Status returns the current state of the write.
        Status() (Status, error)

        // Truncate updates the size of the target blob.
        Truncate(size int64) error
}

// Syncer is implemented by types that are able to flush their in-flight
// writes to disk.
type Syncer interface {
        // Sync flushes the in-flight writes to the disk (when applicable)
        Sync() error
}

// Opt is used to alter the mutable properties of content. It receives the
// Info to modify and may return an error; Writer.Commit accepts Opts.
type Opt func(*Info) error

// WithLabels returns an Opt that sets the given labels on content. The map
// is stored as-is (not copied), replacing any labels already on the Info.
func WithLabels(labels map[string]string) Opt {
        apply := func(target *Info) error {
                target.Labels = labels
                return nil
        }
        return apply
}

// WriterOpts is internally used by WriterOpt. It collects the settings that
// WriterOpt functions apply before a writer is opened.
type WriterOpts struct {
        // Ref is the ref string of the ingestion; set by WithRef.
        Ref string
        // Desc is the OCI descriptor of the content; set by WithDescriptor.
        Desc ocispec.Descriptor
}

// WriterOpt is used for passing options to Ingester.Writer. Each option
// mutates the WriterOpts it is given and may return an error.
type WriterOpt func(*WriterOpts) error

// WithDescriptor returns a WriterOpt that records an OCI descriptor.
// Writer may optionally use the descriptor internally for resolving
// the location of the actual data.
// Write does not require any field of desc to be set.
// If the data size is unknown, desc.Size should be set to 0.
// Some implementations may also accept negative values as "unknown".
func WithDescriptor(desc ocispec.Descriptor) WriterOpt {
        apply := func(target *WriterOpts) error {
                target.Desc = desc
                return nil
        }
        return apply
}

// WithRef returns a WriterOpt that records the ref string identifying the
// ingestion.
func WithRef(ref string) WriterOpt {
        apply := func(target *WriterOpts) error {
                target.Ref = ref
                return nil
        }
        return apply
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package content

import (
        "bytes"
        "context"
        "errors"
        "fmt"
        "io"
        "sync"
        "time"

        "github.com/containerd/containerd/v2/internal/randutil"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// ErrReset signals that a writer has been reset. Copy checks for it with
// errors.Is and restarts the copy when it is returned from the write or the
// commit.
var ErrReset = errors.New("writer has been reset")

// bufPool hands out pointers to reusable 1 MiB byte slices used as scratch
// space when copying content.
var bufPool = sync.Pool{
        New: func() any {
                scratch := make([]byte, 1<<20)
                return &scratch
        },
}

// reader is an optional interface a ReaderAt may implement to supply its own
// io.Reader; NewReader prefers it over wrapping the ReaderAt in a section
// reader.
type reader interface {
        Reader() io.Reader
}

// NewReader returns an io.Reader for the given ReaderAt. When ra provides
// its own Reader method that reader is returned; otherwise ra is wrapped in
// an io.SectionReader covering bytes [0, ra.Size()).
func NewReader(ra ReaderAt) io.Reader {
        custom, ok := ra.(reader)
        if !ok {
                return io.NewSectionReader(ra, 0, ra.Size())
        }
        return custom.Reader()
}

// nopCloserBytesReader adds a no-op Close to a *bytes.Reader so that an
// in-memory blob can be returned where a closer is required.
type nopCloserBytesReader struct {
        *bytes.Reader
}

// Close does nothing and always returns nil; there is no resource to
// release for an in-memory reader.
func (r *nopCloserBytesReader) Close() error {
        return nil
}

// sectionReadCloser is a seekable view over a ReaderAt. Reads and seeks go
// through the embedded section reader; Close is forwarded to the embedded
// io.Closer so the underlying ReaderAt is released.
type sectionReadCloser struct {
        *io.SectionReader
        io.Closer
}

// BlobReadSeeker returns a read seeker for the blob from the provider.
//
// When desc carries inline data that matches both desc.Size and desc.Digest,
// the data is served from memory and the provider is not consulted.
// Otherwise a ReaderAt is opened from the provider and wrapped in a section
// reader covering the whole blob.
//
// The caller must Close the returned value. For provider-backed blobs this
// closes the underlying ReaderAt, which previously was never closed and
// therefore leaked.
func BlobReadSeeker(ctx context.Context, provider Provider, desc ocispec.Descriptor) (io.ReadSeekCloser, error) {
        if int64(len(desc.Data)) == desc.Size && digest.FromBytes(desc.Data) == desc.Digest {
                return &nopCloserBytesReader{bytes.NewReader(desc.Data)}, nil
        }

        ra, err := provider.ReaderAt(ctx, desc)
        if err != nil {
                return nil, err
        }
        return &sectionReadCloser{
                SectionReader: io.NewSectionReader(ra, 0, ra.Size()),
                Closer:        ra,
        }, nil
}

// ReadBlob retrieves the entire contents of the blob from the provider.
//
// If desc carries inline data matching desc.Size and desc.Digest, that data
// is returned directly. Otherwise the blob is read in a single ReadAt call
// into a buffer of ra.Size() bytes; an io.EOF that arrives together with a
// full read is not treated as an error, while an early io.EOF is reported
// as io.ErrUnexpectedEOF.
//
// Avoid using this for large blobs, such as layers.
func ReadBlob(ctx context.Context, provider Provider, desc ocispec.Descriptor) ([]byte, error) {
        inline := desc.Data
        if int64(len(inline)) == desc.Size && digest.FromBytes(inline) == desc.Digest {
                return inline, nil
        }

        ra, err := provider.ReaderAt(ctx, desc)
        if err != nil {
                return nil, err
        }
        defer ra.Close()

        blob := make([]byte, ra.Size())

        read, err := ra.ReadAt(blob, 0)
        switch {
        case err != io.EOF:
                // nil or a real read error: hand it back unchanged
        case int64(read) == ra.Size():
                err = nil
        default:
                err = io.ErrUnexpectedEOF
        }
        return blob, err
}

// WriteBlob writes data with the expected digest into the content store. If
// expected already exists, the method returns immediately and the reader will
// not be consumed.
//
// The writer is opened for ref with desc attached, the data is copied and
// committed via Copy using desc.Size and desc.Digest, and the writer is
// closed before returning.
//
// This is useful when the digest and size are known beforehand.
//
// Copy is buffered, so no need to wrap reader in buffered io.
func WriteBlob(ctx context.Context, cs Ingester, ref string, r io.Reader, desc ocispec.Descriptor, opts ...Opt) error {
        cw, err := OpenWriter(ctx, cs, WithRef(ref), WithDescriptor(desc))
        switch {
        case err == nil:
                // writer opened, carry on with the copy
        case errdefs.IsAlreadyExists(err):
                return nil // already present
        default:
                return fmt.Errorf("failed to open writer: %w", err)
        }
        defer cw.Close()

        return Copy(ctx, cw, r, desc.Size, desc.Digest, opts...)
}

// OpenWriter opens a new writer for the given reference, retrying if the writer
// is locked until the reference is available or returns an error.
//
// Only "unavailable" errors are retried. Between attempts it waits a random
// number of milliseconds drawn with randutil.Intn from a bound that starts
// at 16 and doubles after each wait until it reaches 2048. If ctx is done
// while waiting, the last "unavailable" error is returned.
func OpenWriter(ctx context.Context, cs Ingester, opts ...WriterOpt) (Writer, error) {
        backoff := 16
        for {
                w, err := cs.Writer(ctx, opts...)
                if err == nil {
                        return w, nil
                }
                if !errdefs.IsUnavailable(err) {
                        return nil, err
                }

                // TODO: Check status to determine if the writer is active,
                // continue waiting while active, otherwise return lock
                // error or abort. Requires asserting for an ingest manager

                select {
                case <-ctx.Done():
                        // Propagate lock error
                        return nil, err
                case <-time.After(time.Millisecond * time.Duration(randutil.Intn(backoff))):
                }

                if backoff < 2048 {
                        backoff <<= 1
                }
        }
}

// Copy copies data with the expected digest from the reader into the
// provided content store writer. This copy commits the writer.
//
// This is useful when the digest and size are known beforehand. When
// the size or digest is unknown, these values may be empty.
//
// If the writer reports ErrReset during the write or the commit, the copy is
// restarted from the offset the writer reports, which requires repositioning
// the original reader (see seekReader). A commit that fails because the
// content already exists is treated as success.
//
// Copy is buffered, so no need to wrap reader in buffered io.
func Copy(ctx context.Context, cw Writer, or io.Reader, size int64, expected digest.Digest, opts ...Opt) error {
        // r is the reader used for the current attempt; or is kept so that
        // every retry can reposition the original source.
        r := or
        for i := 0; ; i++ {
                if i >= 1 {
                        log.G(ctx).WithField("digest", expected).Debugf("retrying copy due to reset")
                }

                // The writer's status tells us how much has already been written.
                ws, err := cw.Status()
                if err != nil {
                        return fmt.Errorf("failed to get status: %w", err)
                }
                // Reset the original reader if
                // 1. there is an offset, or
                // 2. this is a retry due to Reset error
                if ws.Offset > 0 || i > 0 {
                        r, err = seekReader(or, ws.Offset, size)
                        if err != nil {
                                return fmt.Errorf("unable to resume write to %v: %w", ws.Ref, err)
                        }
                }

                copied, err := copyWithBuffer(cw, r)
                if errors.Is(err, ErrReset) {
                        // The writer was reset mid-copy: start another attempt.
                        continue
                }
                if err != nil {
                        return fmt.Errorf("failed to copy: %w", err)
                }
                // With a known size, the bytes copied in this attempt must cover
                // everything between the starting offset and size.
                if size != 0 && copied < size-ws.Offset {
                        // Short writes would return its own error, this indicates a read failure
                        return fmt.Errorf("failed to read expected number of bytes: %w", io.ErrUnexpectedEOF)
                }
                if err := cw.Commit(ctx, size, expected, opts...); err != nil {
                        if errors.Is(err, ErrReset) {
                                // Reset reported at commit time: retry the whole copy.
                                continue
                        }
                        // An "already exists" error means the content is present,
                        // which is the desired outcome.
                        if !errdefs.IsAlreadyExists(err) {
                                return fmt.Errorf("failed commit on ref %q: %w", ws.Ref, err)
                        }
                }
                return nil
        }
}

// CopyReaderAt copies n bytes from ra into cw, reading from ra starting at
// the offset the writer currently reports. It fails with
// io.ErrUnexpectedEOF when fewer than n bytes could be copied.
// This copy does not commit the writer.
func CopyReaderAt(cw Writer, ra ReaderAt, n int64) error {
        status, err := cw.Status()
        if err != nil {
                return err
        }

        section := io.NewSectionReader(ra, status.Offset, n)
        switch copied, err := copyWithBuffer(cw, section); {
        case err != nil:
                return fmt.Errorf("failed to copy: %w", err)
        case copied < n:
                // Short writes would return its own error, this indicates a read failure
                return fmt.Errorf("failed to read expected number of bytes: %w", io.ErrUnexpectedEOF)
        }
        return nil
}

// CopyReader copies from r into cw and returns the number of bytes copied.
// When the writer already has a non-zero offset, r is first advanced to that
// offset.
// Note: if the writer has a non-zero offset, the total number
// of bytes read may be greater than those copied if the reader
// is not an io.Seeker.
// This copy does not commit the writer.
func CopyReader(cw Writer, r io.Reader) (int64, error) {
        status, err := cw.Status()
        if err != nil {
                return 0, fmt.Errorf("failed to get status: %w", err)
        }

        src := r
        if offset := status.Offset; offset > 0 {
                if src, err = seekReader(r, offset, 0); err != nil {
                        return 0, fmt.Errorf("unable to resume write to %v: %w", status.Ref, err)
                }
        }

        return copyWithBuffer(cw, src)
}

// seekReader attempts to seek the reader to the given offset, either by
// resolving `io.Seeker`, by detecting `io.ReaderAt`, or discarding
// up to the given offset.
//
// size is the total size of the content when known. It is only consulted on
// the `io.ReaderAt` path, where the returned reader covers the bytes from
// offset up to size; that path is skipped unless size is greater than
// offset.
//
// The returned reader yields the data starting at offset. It is r itself on
// the seek and discard paths, and a new section reader on the
// `io.ReaderAt` path.
func seekReader(r io.Reader, offset, size int64) (io.Reader, error) {
        // attempt to resolve r as a seeker and setup the offset.
        if seeker, ok := r.(io.Seeker); ok {
                nn, err := seeker.Seek(offset, io.SeekStart)
                if nn != offset {
                        if err == nil {
                                err = errors.New("unexpected seek location without seek error")
                        }
                        return nil, fmt.Errorf("failed to seek to offset %v: %w", offset, err)
                }

                if err != nil {
                        return nil, err
                }

                return r, nil
        }

        // ok, let's try io.ReaderAt!
        if readerAt, ok := r.(io.ReaderAt); ok && size > offset {
                // The third argument of io.NewSectionReader is the length of the
                // section, not its end position, so only the remaining
                // size-offset bytes belong to it.
                return io.NewSectionReader(readerAt, offset, size-offset), nil
        }

        // well then, let's just discard up to the offset
        n, err := copyWithBuffer(io.Discard, io.LimitReader(r, offset))
        if err != nil {
                return nil, fmt.Errorf("failed to discard to offset: %w", err)
        }
        if n != offset {
                return nil, errors.New("unable to discard to offset")
        }

        return r, nil
}

// copyWithBuffer is very similar to io.CopyBuffer https://golang.org/pkg/io/#CopyBuffer
// but instead of using Read to read from the src, we use ReadAtLeast to make sure we have
// a full buffer before we do a write operation to dst to reduce overheads associated
// with the write operations of small buffers.
//
// It returns the number of bytes written to dst. Reaching the end of src is
// not an error; a write that accepts fewer bytes than it was given is
// reported as io.ErrShortWrite.
func copyWithBuffer(dst io.Writer, src io.Reader) (written int64, err error) {
        // If the reader has a WriteTo method, use it to do the copy.
        // Avoids an allocation and a copy.
        if wt, ok := src.(io.WriterTo); ok {
                return wt.WriteTo(dst)
        }
        // Similarly, if the writer has a ReadFrom method, use it to do the copy.
        if rf, ok := dst.(io.ReaderFrom); ok {
                return rf.ReadFrom(src)
        }

        pooled := bufPool.Get().(*[]byte)
        defer bufPool.Put(pooled)
        chunk := *pooled

        for {
                nr, readErr := io.ReadAtLeast(src, chunk, len(chunk))
                if nr > 0 {
                        nw, writeErr := dst.Write(chunk[:nr])
                        if nw > 0 {
                                written += int64(nw)
                        }
                        if writeErr != nil {
                                return written, writeErr
                        }
                        if nw != nr {
                                return written, io.ErrShortWrite
                        }
                }

                switch readErr {
                case nil:
                        // full buffer consumed, keep going
                case io.EOF, io.ErrUnexpectedEOF:
                        // If an EOF happens after reading fewer than the requested bytes,
                        // ReadAtLeast returns ErrUnexpectedEOF.
                        return written, nil
                default:
                        return written, readErr
                }
        }
}

// Exists reports whether the content identified by desc.Digest is known to
// the provider. A not-found error from the provider yields (false, nil);
// any other error is returned as-is together with false.
func Exists(ctx context.Context, provider InfoProvider, desc ocispec.Descriptor) (bool, error) {
        _, err := provider.Info(ctx, desc.Digest)
        switch {
        case errdefs.IsNotFound(err):
                return false, nil
        case err != nil:
                return false, err
        default:
                return true, nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proxy

import (
        "context"

        contentapi "github.com/containerd/containerd/api/services/content/v1"
        digest "github.com/opencontainers/go-digest"
)

// remoteReaderAt reads a single blob from a remote content service. Every
// ReadAt call issues its own read request through client.
type remoteReaderAt struct {
        // ctx is the parent context for the requests made by ReadAt.
        ctx context.Context
        // digest identifies the blob to read.
        digest digest.Digest
        // size is the value reported by Size.
        size int64
        // client is the content API client used to issue read requests.
        client contentapi.TTRPCContentClient
}

// Size returns the blob size recorded when the reader was created.
func (ra *remoteReaderAt) Size() int64 {
        return ra.size
}

// ReadAt requests len(p) bytes of the blob starting at offset off and copies
// the streamed response chunks into p until p is full.
//
// It returns the number of bytes copied. If the stream returns an error
// before p has been filled (including the end of the stream), that error is
// returned unchanged together with the bytes copied so far.
func (ra *remoteReaderAt) ReadAt(p []byte, off int64) (n int, err error) {
        rr := &contentapi.ReadContentRequest{
                Digest: ra.digest.String(),
                Offset: off,
                Size:   int64(len(p)),
        }
        // we need a child context with cancel, or the eventually called
        // grpc.NewStream will leak the goroutine until the whole thing is cleared.
        // See comment at https://godoc.org/google.golang.org/grpc#ClientConn.NewStream
        childCtx, cancel := context.WithCancel(ra.ctx)
        // we MUST cancel the child context; see comment above
        defer cancel()
        rc, err := ra.client.Read(childCtx, rr)
        if err != nil {
                return 0, err
        }

        // Each pass receives one response and shrinks p by what was copied, so
        // the loop ends as soon as the caller's buffer is full.
        for len(p) > 0 {
                var resp *contentapi.ReadContentResponse
                // fill our buffer up until we can fill p.
                resp, err = rc.Recv()
                if err != nil {
                        return n, err
                }

                copied := copy(p, resp.Data)
                n += copied
                p = p[copied:]
        }
        return n, nil
}

// Close is a no-op and always returns nil: the reader holds no open stream
// between ReadAt calls, each of which cancels its own request context.
func (ra *remoteReaderAt) Close() error {
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proxy

import (
        "context"
        "fmt"
        "io"

        contentapi "github.com/containerd/containerd/api/services/content/v1"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        protobuftypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/errdefs"
        "github.com/containerd/ttrpc"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "google.golang.org/grpc"
        "google.golang.org/protobuf/types/known/emptypb"
)

// proxyContentStore implements content.Store by forwarding every operation
// to a remote content service through client.
type proxyContentStore struct {
        // client is the rpc content client
        // NOTE: ttrpc is used because it is the smaller interface shared with grpc
        client contentapi.TTRPCContentClient
}

// NewContentStore returns a new content store which communicates with the
// containerd content API through the given client.
//
// client may be a contentapi.ContentClient, a grpc.ClientConnInterface, a
// contentapi.TTRPCContentClient or a *ttrpc.Client. The gRPC variants are
// wrapped in convertClient. Any other type causes a panic with an error
// wrapping errdefs.ErrNotImplemented.
func NewContentStore(client any) content.Store {
        var api contentapi.TTRPCContentClient

        switch c := client.(type) {
        case contentapi.ContentClient:
                api = convertClient{c}
        case grpc.ClientConnInterface:
                api = convertClient{contentapi.NewContentClient(c)}
        case contentapi.TTRPCContentClient:
                api = c
        case *ttrpc.Client:
                api = contentapi.NewTTRPCContentClient(c)
        default:
                panic(fmt.Errorf("unsupported content client %T: %w", client, errdefs.ErrNotImplemented))
        }

        return &proxyContentStore{client: api}
}

// Info asks the remote content service for the metadata of the content
// identified by dgst. Service errors are translated with errdefs.FromGRPC.
func (pcs *proxyContentStore) Info(ctx context.Context, dgst digest.Digest) (content.Info, error) {
        req := &contentapi.InfoRequest{Digest: dgst.String()}

        resp, err := pcs.client.Info(ctx, req)
        if err != nil {
                return content.Info{}, errdefs.FromGRPC(err)
        }
        return infoFromGRPC(resp.Info), nil
}

// Walk streams the content list matching filters from the remote content
// service and calls fn for every item received.
//
// The walk stops at the first error returned by fn, which is returned
// unchanged. Errors from the service are translated with errdefs.FromGRPC.
// Reaching the end of the stream is the normal termination and yields nil.
func (pcs *proxyContentStore) Walk(ctx context.Context, fn content.WalkFunc, filters ...string) error {
        // Run the stream on a child context that is cancelled on every return
        // path. Without it, returning early (for example when fn fails) would
        // abandon a stream that is neither drained nor cancelled, leaking its
        // resources until the caller's context ends.
        ctx, cancel := context.WithCancel(ctx)
        defer cancel()

        session, err := pcs.client.List(ctx, &contentapi.ListContentRequest{
                Filters: filters,
        })
        if err != nil {
                return errdefs.FromGRPC(err)
        }

        for {
                msg, err := session.Recv()
                if err == io.EOF {
                        // end of stream: every item has been delivered
                        return nil
                }
                if err != nil {
                        return errdefs.FromGRPC(err)
                }

                for _, info := range msg.Info {
                        if err := fn(infoFromGRPC(info)); err != nil {
                                return err
                        }
                }
        }
}

// Delete asks the remote content service to remove the content identified
// by dgst. Service errors are translated with errdefs.FromGRPC.
func (pcs *proxyContentStore) Delete(ctx context.Context, dgst digest.Digest) error {
        req := &contentapi.DeleteContentRequest{Digest: dgst.String()}

        _, err := pcs.client.Delete(ctx, req)
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// ReaderAt returns a reader for the remote content identified by
// desc.Digest. The size is looked up through Info first, so a missing blob
// is reported here rather than on the first read. ctx is retained by the
// returned reader and used for its read requests.
//
// ReaderAt ignores MediaType.
func (pcs *proxyContentStore) ReaderAt(ctx context.Context, desc ocispec.Descriptor) (content.ReaderAt, error) {
        info, err := pcs.Info(ctx, desc.Digest)
        if err != nil {
                return nil, err
        }

        ra := &remoteReaderAt{
                ctx:    ctx,
                digest: desc.Digest,
                size:   info.Size,
                client: pcs.client,
        }
        return ra, nil
}

// Status fetches the status of the ingestion identified by ref from the
// remote content service and converts it into a content.Status. Service
// errors are translated with errdefs.FromGRPC.
func (pcs *proxyContentStore) Status(ctx context.Context, ref string) (content.Status, error) {
        resp, err := pcs.client.Status(ctx, &contentapi.StatusRequest{Ref: ref})
        if err != nil {
                return content.Status{}, errdefs.FromGRPC(err)
        }

        remote := resp.Status

        var st content.Status
        st.Ref = remote.Ref
        st.StartedAt = protobuf.FromTimestamp(remote.StartedAt)
        st.UpdatedAt = protobuf.FromTimestamp(remote.UpdatedAt)
        st.Offset = remote.Offset
        st.Total = remote.Total
        st.Expected = digest.Digest(remote.Expected)
        return st, nil
}

// Update sends info to the remote content service together with a field
// mask built from fieldpaths, and returns the Info reported back by the
// service. Service errors are translated with errdefs.FromGRPC.
func (pcs *proxyContentStore) Update(ctx context.Context, info content.Info, fieldpaths ...string) (content.Info, error) {
        req := &contentapi.UpdateRequest{
                Info:       infoToGRPC(&info),
                UpdateMask: &protobuftypes.FieldMask{Paths: fieldpaths},
        }

        resp, err := pcs.client.Update(ctx, req)
        if err != nil {
                return content.Info{}, errdefs.FromGRPC(err)
        }
        return infoFromGRPC(resp.Info), nil
}

// ListStatuses fetches the ingestion statuses matching filters from the
// remote content service and converts each into a content.Status. The
// result is nil when the service reports no statuses. Service errors are
// translated with errdefs.FromGRPC.
func (pcs *proxyContentStore) ListStatuses(ctx context.Context, filters ...string) ([]content.Status, error) {
        req := &contentapi.ListStatusesRequest{Filters: filters}

        resp, err := pcs.client.ListStatuses(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        var statuses []content.Status
        for i := range resp.Statuses {
                remote := resp.Statuses[i]

                var st content.Status
                st.Ref = remote.Ref
                st.StartedAt = protobuf.FromTimestamp(remote.StartedAt)
                st.UpdatedAt = protobuf.FromTimestamp(remote.UpdatedAt)
                st.Offset = remote.Offset
                st.Total = remote.Total
                st.Expected = digest.Digest(remote.Expected)

                statuses = append(statuses, st)
        }

        return statuses, nil
}

// Writer applies opts, negotiates a write session for the resulting ref with
// the remote content service, and returns a writer positioned at the offset
// the service reported. Negotiation errors are translated with
// errdefs.FromGRPC.
//
// Writer ignores MediaType.
func (pcs *proxyContentStore) Writer(ctx context.Context, opts ...content.WriterOpt) (content.Writer, error) {
        var wOpts content.WriterOpts
        for i := range opts {
                if err := opts[i](&wOpts); err != nil {
                        return nil, err
                }
        }

        desc := wOpts.Desc
        stream, offset, err := pcs.negotiate(ctx, wOpts.Ref, desc.Size, desc.Digest)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        rw := &remoteWriter{
                ref:    wOpts.Ref,
                client: stream,
                offset: offset,
        }
        return rw, nil
}

// Abort asks the remote content service to abort the ingestion identified
// by ref. Service errors are translated with errdefs.FromGRPC.
func (pcs *proxyContentStore) Abort(ctx context.Context, ref string) error {
        req := &contentapi.AbortRequest{Ref: ref}

        _, err := pcs.client.Abort(ctx, req)
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// negotiate opens a write stream to the remote content service and performs
// the initial STAT exchange for ref, announcing the expected total size and
// digest. It returns the open stream together with the offset reported in
// the service's reply. Errors are returned untranslated.
func (pcs *proxyContentStore) negotiate(ctx context.Context, ref string, size int64, expected digest.Digest) (contentapi.TTRPCContent_WriteClient, int64, error) {
        stream, err := pcs.client.Write(ctx)
        if err != nil {
                return nil, 0, err
        }

        stat := &contentapi.WriteContentRequest{
                Action:   contentapi.WriteAction_STAT,
                Ref:      ref,
                Total:    size,
                Expected: expected.String(),
        }
        if err := stream.Send(stat); err != nil {
                return nil, 0, err
        }

        reply, err := stream.Recv()
        if err != nil {
                return nil, 0, err
        }
        return stream, reply.Offset, nil
}

// convertClient adapts a contentapi.ContentClient so that it can be stored
// in proxyContentStore.client, which is typed as
// contentapi.TTRPCContentClient.
// NOTE(review): presumably the wrappers exist because the gRPC client
// methods have different signatures (e.g. extra call options) — confirm
// against the generated API.
type convertClient struct {
        contentapi.ContentClient
}

// Info forwards the request unchanged to the wrapped ContentClient.
func (c convertClient) Info(ctx context.Context, req *contentapi.InfoRequest) (*contentapi.InfoResponse, error) {
        return c.ContentClient.Info(ctx, req)
}

// Update forwards the request unchanged to the wrapped ContentClient.
func (c convertClient) Update(ctx context.Context, req *contentapi.UpdateRequest) (*contentapi.UpdateResponse, error) {
        return c.ContentClient.Update(ctx, req)
}

// convertListClient wraps a contentapi.Content_ListClient stream so it can be
// returned from convertClient.List as a contentapi.TTRPCContent_ListClient.
type convertListClient struct {
        contentapi.Content_ListClient
}

// List calls the embedded ContentClient's List method and wraps the returned
// stream in a convertListClient. The error from the underlying call is passed
// through as is.
func (c convertClient) List(ctx context.Context, req *contentapi.ListContentRequest) (contentapi.TTRPCContent_ListClient, error) {
        stream, err := c.ContentClient.List(ctx, req)
        if stream != nil {
                return convertListClient{stream}, err
        }
        // Return a plain nil so the caller never receives a non-nil interface
        // value that merely wraps a nil stream.
        return nil, err
}

// Delete forwards req to the embedded ContentClient's Delete method and
// returns its result unchanged.
func (c convertClient) Delete(ctx context.Context, req *contentapi.DeleteContentRequest) (*emptypb.Empty, error) {
        return c.ContentClient.Delete(ctx, req)
}

// convertReadClient wraps a contentapi.Content_ReadClient stream so it can be
// returned from convertClient.Read as a contentapi.TTRPCContent_ReadClient.
type convertReadClient struct {
        contentapi.Content_ReadClient
}

// Read calls the embedded ContentClient's Read method and wraps the returned
// stream in a convertReadClient. The error from the underlying call is passed
// through as is.
func (c convertClient) Read(ctx context.Context, req *contentapi.ReadContentRequest) (contentapi.TTRPCContent_ReadClient, error) {
        stream, err := c.ContentClient.Read(ctx, req)
        if stream != nil {
                return convertReadClient{stream}, err
        }
        // Return a plain nil so the caller never receives a non-nil interface
        // value that merely wraps a nil stream.
        return nil, err
}

// Status forwards req to the embedded ContentClient's Status method and
// returns its result unchanged.
func (c convertClient) Status(ctx context.Context, req *contentapi.StatusRequest) (*contentapi.StatusResponse, error) {
        return c.ContentClient.Status(ctx, req)
}

// ListStatuses forwards req to the embedded ContentClient's ListStatuses
// method and returns its result unchanged.
func (c convertClient) ListStatuses(ctx context.Context, req *contentapi.ListStatusesRequest) (*contentapi.ListStatusesResponse, error) {
        return c.ContentClient.ListStatuses(ctx, req)
}

// convertWriteClient wraps a contentapi.Content_WriteClient stream so it can
// be returned from convertClient.Write as a contentapi.TTRPCContent_WriteClient.
type convertWriteClient struct {
        contentapi.Content_WriteClient
}

// Write calls the embedded ContentClient's Write method and wraps the
// returned stream in a convertWriteClient. The error from the underlying call
// is passed through as is.
func (c convertClient) Write(ctx context.Context) (contentapi.TTRPCContent_WriteClient, error) {
        stream, err := c.ContentClient.Write(ctx)
        if stream != nil {
                return convertWriteClient{stream}, err
        }
        // Return a plain nil so the caller never receives a non-nil interface
        // value that merely wraps a nil stream.
        return nil, err
}

// Abort forwards req to the embedded ContentClient's Abort method and returns
// its result unchanged.
func (c convertClient) Abort(ctx context.Context, req *contentapi.AbortRequest) (*emptypb.Empty, error) {
        return c.ContentClient.Abort(ctx, req)
}

// infoToGRPC builds the API representation of info: the digest is rendered as
// a string, the timestamps are converted with protobuf.ToTimestamp, and the
// size and labels are copied over directly (the label map is shared, not
// cloned). info must not be nil.
func infoToGRPC(info *content.Info) *contentapi.Info {
        out := new(contentapi.Info)
        out.Digest = info.Digest.String()
        out.Size = info.Size
        out.CreatedAt = protobuf.ToTimestamp(info.CreatedAt)
        out.UpdatedAt = protobuf.ToTimestamp(info.UpdatedAt)
        out.Labels = info.Labels
        return out
}

// infoFromGRPC builds a content.Info from its API representation: the digest
// string is converted to a digest.Digest without validation, the timestamps
// are converted with protobuf.FromTimestamp, and the size and labels are
// copied over directly (the label map is shared, not cloned).
func infoFromGRPC(info *contentapi.Info) content.Info {
        var out content.Info
        out.Digest = digest.Digest(info.Digest)
        out.Size = info.Size
        out.CreatedAt = protobuf.FromTimestamp(info.CreatedAt)
        out.UpdatedAt = protobuf.FromTimestamp(info.UpdatedAt)
        out.Labels = info.Labels
        return out
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proxy

import (
        "context"
        "fmt"
        "io"

        contentapi "github.com/containerd/containerd/api/services/content/v1"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
        digest "github.com/opencontainers/go-digest"
)

// remoteWriter drives a content write over a TTRPCContent_WriteClient stream
// using synchronous request/response exchanges.
type remoteWriter struct {
        // ref is the write reference reported back in Status.
        ref string
        // client is the stream every request is sent on.
        client contentapi.TTRPCContent_WriteClient
        // offset is the locally tracked write offset sent with WRITE and
        // COMMIT requests.
        offset int64
        // digest is the most recent non-empty digest seen in a response.
        digest digest.Digest
}

// send performs one synchronous request/response cycle on the stream: it
// sends req, waits for the reply, and records the reply's digest when one is
// present. Errors are returned unconverted.
func (rw *remoteWriter) send(req *contentapi.WriteContentRequest) (*contentapi.WriteContentResponse, error) {
        if err := rw.client.Send(req); err != nil {
                return nil, err
        }

        resp, err := rw.client.Recv()
        if err != nil {
                return resp, err
        }

        // Keep the cached digest in sync with what the remote reports.
        if resp.Digest != "" {
                rw.digest = digest.Digest(resp.Digest)
        }
        return resp, nil
}

// Status issues a STAT request on the stream and translates the response into
// a content.Status. Ref comes from the writer itself; offset, total and both
// timestamps come from the response. A failed exchange is reported as a
// wrapped, errdefs-converted error.
func (rw *remoteWriter) Status() (content.Status, error) {
        statReq := &contentapi.WriteContentRequest{Action: contentapi.WriteAction_STAT}

        resp, err := rw.send(statReq)
        if err != nil {
                return content.Status{}, fmt.Errorf("error getting writer status: %w", errdefs.FromGRPC(err))
        }

        var st content.Status
        st.Ref = rw.ref
        st.Offset = resp.Offset
        st.Total = resp.Total
        st.StartedAt = protobuf.FromTimestamp(resp.StartedAt)
        st.UpdatedAt = protobuf.FromTimestamp(resp.UpdatedAt)
        return st, nil
}

// Digest returns the locally cached digest, i.e. the last non-empty digest
// received in a response (or the one stored by a successful Commit). It does
// not contact the remote.
func (rw *remoteWriter) Digest() digest.Digest {
        return rw.digest
}

// Write sends p in a single WRITE request at the writer's current offset and
// waits for the response. The number of bytes written is derived from how far
// the response's offset advanced past the offset that was sent; the local
// offset is moved by that amount. If fewer than len(p) bytes were
// acknowledged, io.ErrShortWrite is returned alongside the count.
func (rw *remoteWriter) Write(p []byte) (n int, err error) {
        start := rw.offset

        resp, sendErr := rw.send(&contentapi.WriteContentRequest{
                Action: contentapi.WriteAction_WRITE,
                Offset: start,
                Data:   p,
        })
        if sendErr != nil {
                return 0, fmt.Errorf("failed to send write: %w", errdefs.FromGRPC(sendErr))
        }

        n = int(resp.Offset - start)
        rw.offset += int64(n)
        if resp.Digest != "" {
                rw.digest = digest.Digest(resp.Digest)
        }

        if n < len(p) {
                return n, io.ErrShortWrite
        }
        return n, nil
}

// Commit applies opts to collect labels, sends a COMMIT request carrying the
// expected size, the current offset, the expected digest and those labels,
// and then validates the response: when size is non-zero the reported offset
// must equal it, and when expected is non-empty the reported digest must
// match. On success the writer's digest and offset are updated from the
// response.
//
// The send side of the stream is always closed before returning; the close
// error is reported only if nothing else failed.
func (rw *remoteWriter) Commit(ctx context.Context, size int64, expected digest.Digest, opts ...content.Opt) (err error) {
        defer func() {
                if closeErr := rw.Close(); err == nil {
                        err = closeErr
                }
        }()

        var info content.Info
        for _, apply := range opts {
                if optErr := apply(&info); optErr != nil {
                        return optErr
                }
        }

        req := &contentapi.WriteContentRequest{
                Action:   contentapi.WriteAction_COMMIT,
                Total:    size,
                Offset:   rw.offset,
                Expected: expected.String(),
                Labels:   info.Labels,
        }
        resp, err := rw.send(req)
        if err != nil {
                return fmt.Errorf("commit failed: %w", errdefs.FromGRPC(err))
        }

        if size != 0 && resp.Offset != size {
                return fmt.Errorf("unexpected size: %v != %v", resp.Offset, size)
        }

        committed := digest.Digest(resp.Digest)
        if expected != "" && committed != expected {
                return fmt.Errorf("unexpected digest: %v != %v", resp.Digest, expected)
        }

        rw.digest, rw.offset = committed, resp.Offset
        return nil
}

// Truncate sets the locally tracked offset to size and always returns nil.
// No request is sent to the remote here.
func (rw *remoteWriter) Truncate(size int64) error {
        // This truncation won't actually be validated until a write is issued.
        rw.offset = size
        return nil
}

// Close closes the sending side of the underlying stream via CloseSend and
// returns its error.
func (rw *remoteWriter) Close() error {
        return rw.client.CloseSend()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package testsuite

import (
        "bytes"
        "context"
        "fmt"
        "io"
        "math/rand"
        "os"
        "runtime"
        "sync/atomic"
        "testing"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/pkg/testutil"
        "github.com/containerd/errdefs"
        "github.com/containerd/log/logtest"
        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "github.com/stretchr/testify/assert"
)

const (
        // emptyDigest is the SHA-256 digest of zero-length input, in
        // "algorithm:hex" form.
        emptyDigest = "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
)

// StoreInitFn initializes a content store rooted at the given directory. It
// returns the context to use for the test (which may differ from the one
// passed in), the store itself, a function for destroying the content store,
// and an error if initialization failed.
type StoreInitFn func(ctx context.Context, root string) (context.Context, content.Store, func() error, error)

// ContentSuite runs the general content store checks as subtests of t. Each
// subtest gets its own store, created through storeFn by makeTest.
func ContentSuite(t *testing.T, name string, storeFn StoreInitFn) {
        suite := []struct {
                name  string
                check func(ctx context.Context, t *testing.T, cs content.Store)
        }{
                {"Writer", checkContentStoreWriter},
                {"UpdateStatus", checkUpdateStatus},
                {"CommitExists", checkCommitExists},
                {"Resume", checkResumeWriter},
                {"ResumeTruncate", checkResume(resumeTruncate)},
                {"ResumeDiscard", checkResume(resumeDiscard)},
                {"ResumeCopy", checkResume(resumeCopy)},
                {"ResumeCopySeeker", checkResume(resumeCopySeeker)},
                {"ResumeCopyReaderAt", checkResume(resumeCopyReaderAt)},
                {"SmallBlob", checkSmallBlob},
                {"Labels", checkLabels},
                {"CommitErrorState", checkCommitErrorState},
        }

        for _, tc := range suite {
                t.Run(tc.name, makeTest(t, name, storeFn, tc.check))
        }
}

// ContentCrossNSSharedSuite runs the cross-namespace checks that apply under
// the shared content policy as subtests of t.
func ContentCrossNSSharedSuite(t *testing.T, name string, storeFn StoreInitFn) {
        suite := []struct {
                name  string
                check func(ctx context.Context, t *testing.T, cs content.Store)
        }{
                {"CrossNamespaceAppend", checkCrossNSAppend},
                {"CrossNamespaceShare", checkCrossNSShare},
        }

        for _, tc := range suite {
                t.Run(tc.name, makeTest(t, name, storeFn, tc.check))
        }
}

// ContentCrossNSIsolatedSuite runs the cross-namespace check that applies
// under the isolated content policy as a subtest of t.
func ContentCrossNSIsolatedSuite(t *testing.T, name string, storeFn StoreInitFn) {
        isolate := makeTest(t, name, storeFn, checkCrossNSIsolate)
        t.Run("CrossNamespaceIsolate", isolate)
}

// ContentSharedNSIsolatedSuite runs the shared-namespace check that applies
// under the isolated content policy as a subtest of t.
func ContentSharedNSIsolatedSuite(t *testing.T, name string, storeFn StoreInitFn) {
        isolate := makeTest(t, name, storeFn, checkSharedNSIsolate)
        t.Run("SharedNamespaceIsolate", isolate)
}

// ContextWrapper is used to decorate new context used inside the test
// before using the context on the content store.
// This can be used to support leasing and multiple namespaces tests.
//
// It returns the decorated context, a release function to call when the
// context is no longer needed, and an error if wrapping failed.
// NOTE(review): the exact meaning of sharedNS is defined by the wrapper
// implementation, which is not visible here.
type ContextWrapper func(ctx context.Context, sharedNS bool) (context.Context, func(context.Context) error, error)

// wrapperKey is the context key under which SetContextWrapper stores a
// ContextWrapper.
type wrapperKey struct{}

// SetContextWrapper sets the wrapper on the context for deriving
// new test contexts from the context. The wrapper is stored under
// wrapperKey{} and the derived context is returned.
func SetContextWrapper(ctx context.Context, w ContextWrapper) context.Context {
        return context.WithValue(ctx, wrapperKey{}, w)
}

// nameKey is the context key under which the test name read by Name is stored.
type nameKey struct{}

// Name returns the test name stored on ctx under nameKey{}. It returns the
// empty string when no name is present or the stored value is not a string.
func Name(ctx context.Context) string {
        if name, ok := ctx.Value(nameKey{}).(string); ok {
                return name
        }
        return ""
}

// makeTest adapts fn into a function that can be passed to t.Run.
//
// The returned function builds a context carrying name (readable via Name)
// and passes it through logtest.WithT, creates a temporary directory,
// initializes a store in it with storeFn, applies the ContextWrapper found on
// the context returned by storeFn (if any), and then calls fn with the
// resulting context and store.
//
// The outer t parameter is not used: the returned closure declares its own t,
// which shadows it.
func makeTest(t *testing.T, name string, storeFn func(ctx context.Context, root string) (context.Context, content.Store, func() error, error), fn func(ctx context.Context, t *testing.T, cs content.Store)) func(t *testing.T) {
        return func(t *testing.T) {
                ctx := context.WithValue(context.Background(), nameKey{}, name)
                ctx = logtest.WithT(ctx, t)

                tmpDir, err := os.MkdirTemp("", "content-suite-"+name+"-")
                if err != nil {
                        t.Fatal(err)
                }
                // Deferred calls run in reverse order of registration, so the
                // directory is removed last, after the store cleanup below.
                defer os.RemoveAll(tmpDir)

                ctx, cs, cleanup, err := storeFn(ctx, tmpDir)
                if err != nil {
                        t.Fatal(err)
                }
                defer func() {
                        // A cleanup error is only reported if the test has not
                        // already failed for another reason.
                        if err := cleanup(); err != nil && !t.Failed() {
                                t.Fatalf("Cleanup failed: %+v", err)
                        }
                }()

                // The wrapper is looked up on the context returned by storeFn.
                w, ok := ctx.Value(wrapperKey{}).(ContextWrapper)
                if ok {
                        var done func(context.Context) error
                        ctx, done, err = w(ctx, false)
                        if err != nil {
                                t.Fatalf("Error wrapping context: %+v", err)
                        }
                        defer func() {
                                // done receives the wrapped context and runs
                                // before the store cleanup registered above.
                                if err := done(ctx); err != nil && !t.Failed() {
                                        t.Fatalf("Wrapper release failed: %+v", err)
                                }
                        }()
                }

                // Registered last so it runs first, while tmpDir still exists.
                defer testutil.DumpDirOnFailure(t, tmpDir)
                fn(ctx, t, cs)
        }
}

// labels is the label set passed to Commit by several checks in this file.
// The timestamp value is computed once, at package initialization.
// NOTE(review): presumably the gc.root label keeps the committed content from
// being garbage collected during the test — confirm against the GC docs.
var labels = map[string]string{
        "containerd.io/gc.root": time.Now().UTC().Format(time.RFC3339),
}

// checkContentStoreWriter opens four writers that differ only in how much of
// the descriptor is supplied up front (nothing, size only, digest only, size
// and digest), copies 256 bytes of content into each through a small buffer,
// commits with the package-level labels, and verifies the resulting digest
// and stored info.
func checkContentStoreWriter(ctx context.Context, t *testing.T, cs content.Store) {
        // No descriptor information.
        c1, d1 := createContent(256)
        w1, err := content.OpenWriter(ctx, cs, content.WithRef("c1"))
        if err != nil {
                t.Fatal(err)
        }
        defer w1.Close()

        // Size only.
        c2, d2 := createContent(256)
        w2, err := content.OpenWriter(ctx, cs, content.WithRef("c2"), content.WithDescriptor(ocispec.Descriptor{Size: int64(len(c2))}))
        if err != nil {
                t.Fatal(err)
        }
        defer w2.Close()

        // Digest only.
        c3, d3 := createContent(256)
        w3, err := content.OpenWriter(ctx, cs, content.WithRef("c3"), content.WithDescriptor(ocispec.Descriptor{Digest: d3}))
        if err != nil {
                t.Fatal(err)
        }
        defer w3.Close()

        // Both size and digest.
        c4, d4 := createContent(256)
        w4, err := content.OpenWriter(ctx, cs, content.WithRef("c4"), content.WithDescriptor(ocispec.Descriptor{Size: int64(len(c4)), Digest: d4}))
        if err != nil {
                t.Fatal(err)
        }
        defer w4.Close()

        // All four writers are open before any of them is written to.
        smallbuf := make([]byte, 32)
        for _, s := range []struct {
                content []byte
                digest  digest.Digest
                writer  content.Writer
        }{
                {
                        content: c1,
                        digest:  d1,
                        writer:  w1,
                },
                {
                        content: c2,
                        digest:  d2,
                        writer:  w2,
                },
                {
                        content: c3,
                        digest:  d3,
                        writer:  w3,
                },
                {
                        content: c4,
                        digest:  d4,
                        writer:  w4,
                },
        } {
                n, err := io.CopyBuffer(s.writer, bytes.NewReader(s.content), smallbuf)
                if err != nil {
                        t.Fatal(err)
                }

                if n != int64(len(s.content)) {
                        t.Fatalf("Unexpected copy length %d, expected %d", n, len(s.content))
                }

                // Commit is called with size 0 and an empty digest; the
                // timestamps taken around it bound the window passed to
                // checkInfo.
                preCommit := time.Now()
                if err := s.writer.Commit(ctx, 0, "", content.WithLabels(labels)); err != nil {
                        t.Fatal(err)
                }
                postCommit := time.Now()

                if s.writer.Digest() != s.digest {
                        t.Fatalf("Unexpected commit digest %s, expected %s", s.writer.Digest(), s.digest)
                }

                info := content.Info{
                        Digest: s.digest,
                        Size:   int64(len(s.content)),
                        Labels: labels,
                }
                if err := checkInfo(ctx, cs, s.digest, info, preCommit, postCommit, preCommit, postCommit); err != nil {
                        t.Fatalf("Check info failed: %+v", err)
                }
        }
}

// checkResumeWriter writes the first half of a 256-byte blob with one writer,
// closes it, reopens a writer on the same ref, verifies the reported status
// matches what was seen before the close, writes the second half, commits,
// and checks the stored info.
func checkResumeWriter(ctx context.Context, t *testing.T, cs content.Store) {
        // checkWrite fails the test on a write error or a short write.
        checkWrite := func(t *testing.T, w io.Writer, p []byte) {
                t.Helper()
                n, err := w.Write(p)
                if err != nil {
                        t.Fatal(err)
                }

                if n != len(p) {
                        t.Fatal("short write to content store")
                }
        }

        var (
                ref           = "cb"
                cb, dgst      = createContent(256)
                first, second = cb[:128], cb[128:]
        )

        // Timestamps taken around each operation bound the start/update
        // windows handed to checkStatus.
        preStart := time.Now()
        w1, err := content.OpenWriter(ctx, cs, content.WithRef(ref), content.WithDescriptor(ocispec.Descriptor{Size: 256, Digest: dgst}))
        if err != nil {
                t.Fatal(err)
        }
        postStart := time.Now()
        preUpdate := postStart

        checkWrite(t, w1, first)
        postUpdate := time.Now()

        dgstFirst := digest.FromBytes(first)
        expected := content.Status{
                Ref:      ref,
                Offset:   int64(len(first)),
                Total:    int64(len(cb)),
                Expected: dgstFirst,
        }

        checkStatus(t, w1, expected, dgstFirst, preStart, postStart, preUpdate, postUpdate)
        assert.Nil(t, w1.Close(), "close first writer")

        // Reopen the same ref with the same descriptor.
        w2, err := content.OpenWriter(ctx, cs, content.WithRef(ref), content.WithDescriptor(ocispec.Descriptor{Size: 256, Digest: dgst}))
        if err != nil {
                t.Fatal(err)
        }

        // status should be consistent with version before close.
        checkStatus(t, w2, expected, dgstFirst, preStart, postStart, preUpdate, postUpdate)

        preUpdate = time.Now()
        checkWrite(t, w2, second)
        postUpdate = time.Now()

        // After the second half the offset equals the total and the digest
        // is that of the whole blob.
        expected.Offset = expected.Total
        expected.Expected = dgst
        checkStatus(t, w2, expected, dgst, preStart, postStart, preUpdate, postUpdate)

        preCommit := time.Now()
        if err := w2.Commit(ctx, 0, ""); err != nil {
                t.Fatalf("commit failed: %+v", err)
        }
        postCommit := time.Now()

        assert.Nil(t, w2.Close(), "close second writer")
        info := content.Info{
                Digest: dgst,
                Size:   256,
        }

        if err := checkInfo(ctx, cs, dgst, info, preCommit, postCommit, preCommit, postCommit); err != nil {
                t.Fatalf("Check info failed: %+v", err)
        }
}

// checkCommitExists stores a blob and then tries to commit the same bytes
// again under fresh refs, once passing the blob's digest to Commit and once
// passing an empty digest. Both commits are expected to fail with an
// already-exists error.
func checkCommitExists(ctx context.Context, t *testing.T, cs content.Store) {
        blob, blobDigest := createContent(256)
        if err := content.WriteBlob(ctx, cs, "c1", bytes.NewReader(blob), ocispec.Descriptor{Digest: blobDigest}); err != nil {
                t.Fatal(err)
        }

        // First attempt supplies the known digest, the second supplies none.
        expectedDigests := []digest.Digest{blobDigest, ""}
        for i, expected := range expectedDigests {
                ref := fmt.Sprintf("c1-commitexists-%d", i)
                w, err := content.OpenWriter(ctx, cs, content.WithRef(ref))
                if err != nil {
                        t.Fatal(err)
                }
                if _, err := w.Write(blob); err != nil {
                        w.Close()
                        t.Fatal(err)
                }

                err = w.Commit(ctx, int64(len(blob)), expected)
                w.Close()

                switch {
                case err == nil:
                        t.Errorf("(%d) Expected already exists error", i)
                case !errdefs.IsAlreadyExists(err):
                        t.Fatalf("(%d) Unexpected error: %+v", i, err)
                }
        }
}

// checkRefNotAvailable asserts that opening a writer for ref directly on the
// store fails with an unavailable error. Getting a writer, or any other kind
// of error, fails the test.
func checkRefNotAvailable(ctx context.Context, t *testing.T, cs content.Store, ref string) {
        t.Helper()

        w, err := cs.Writer(ctx, content.WithRef(ref))
        switch {
        case err == nil:
                // t.Fatal unwinds through deferred calls, so the unexpected
                // writer is still closed.
                defer w.Close()
                t.Fatal("writer created with ref, expected to be in use")
        case !errdefs.IsUnavailable(err):
                t.Fatalf("Expected unavailable error, got %+v", err)
        }
}

// checkCommitErrorState drives a single ref through a series of failing and
// succeeding commits and checks, at each step, both the error class returned
// by Commit and whether the ref is still held (via checkRefNotAvailable)
// while a writer for it is open. The steps depend on each other and must run
// in this order.
func checkCommitErrorState(ctx context.Context, t *testing.T, cs content.Store) {
        c1, d1 := createContent(256)
        _, d2 := createContent(256)
        // Seed the store with c1 so later commits of the same bytes collide.
        if err := content.WriteBlob(ctx, cs, "c1", bytes.NewReader(c1), ocispec.Descriptor{Digest: d1}); err != nil {
                t.Fatal(err)
        }

        ref := "c1-commiterror-state"
        w, err := content.OpenWriter(ctx, cs, content.WithRef(ref))
        if err != nil {
                t.Fatal(err)
        }
        if _, err := w.Write(c1); err != nil {
                if err := w.Close(); err != nil {
                        t.Errorf("Close error: %+v", err)
                }
                t.Fatal(err)
        }

        checkRefNotAvailable(ctx, t, cs, ref)

        // Check exists
        err = w.Commit(ctx, int64(len(c1)), d1)
        if err == nil {
                t.Fatalf("Expected already exists error")
        } else if !errdefs.IsAlreadyExists(err) {
                if err := w.Close(); err != nil {
                        t.Errorf("Close error: %+v", err)
                }
                t.Fatalf("Unexpected error: %+v", err)
        }

        // The writer is reopened here without an explicit Close of the
        // previous one after the already-exists commit.
        w, err = content.OpenWriter(ctx, cs, content.WithRef(ref))
        if err != nil {
                t.Fatal(err)
        }

        checkRefNotAvailable(ctx, t, cs, ref)

        if _, err := w.Write(c1); err != nil {
                if err := w.Close(); err != nil {
                        t.Errorf("close error: %+v", err)
                }
                t.Fatal(err)
        }

        // Check exists without providing digest
        err = w.Commit(ctx, int64(len(c1)), "")
        if err == nil {
                t.Fatalf("Expected already exists error")
        } else if !errdefs.IsAlreadyExists(err) {
                if err := w.Close(); err != nil {
                        t.Errorf("Close error: %+v", err)
                }
                t.Fatalf("Unexpected error: %+v", err)
        }
        w.Close()

        w, err = content.OpenWriter(ctx, cs, content.WithRef(ref))
        if err != nil {
                t.Fatal(err)
        }

        checkRefNotAvailable(ctx, t, cs, ref)

        // Write c1 plus four extra bytes so the content no longer matches the
        // seeded blob.
        if _, err := w.Write(append(c1, []byte("more")...)); err != nil {
                if err := w.Close(); err != nil {
                        t.Errorf("close error: %+v", err)
                }
                t.Fatal(err)
        }

        // Commit with the wrong digest should produce an error
        err = w.Commit(ctx, int64(len(c1))+4, d2)
        if err == nil {
                t.Fatalf("Expected error from wrong digest")
        } else if !errdefs.IsFailedPrecondition(err) {
                t.Errorf("Unexpected error: %+v", err)
        }

        w.Close()
        // Nothing is written through this writer before the next commit; the
        // test relies on the data written above still being associated with
        // ref.
        w, err = content.OpenWriter(ctx, cs, content.WithRef(ref))
        if err != nil {
                t.Fatal(err)
        }

        checkRefNotAvailable(ctx, t, cs, ref)

        // Commit with wrong size should also produce an error
        err = w.Commit(ctx, int64(len(c1)), "")
        if err == nil {
                t.Fatalf("Expected error from wrong size")
        } else if !errdefs.IsFailedPrecondition(err) {
                t.Errorf("Unexpected error: %+v", err)
        }

        w.Close()
        w, err = content.OpenWriter(ctx, cs, content.WithRef(ref))
        if err != nil {
                t.Fatal(err)
        }

        checkRefNotAvailable(ctx, t, cs, ref)

        // Now expect commit to succeed
        if err := w.Commit(ctx, int64(len(c1))+4, ""); err != nil {
                if err := w.Close(); err != nil {
                        t.Errorf("close error: %+v", err)
                }
                t.Fatalf("Failed to commit: %+v", err)
        }

        w.Close()
        // Create another writer with same reference
        w, err = content.OpenWriter(ctx, cs, content.WithRef(ref))
        if err != nil {
                t.Fatalf("Failed to open writer: %+v", err)
        }

        if _, err := w.Write(c1); err != nil {
                if err := w.Close(); err != nil {
                        t.Errorf("close error: %+v", err)
                }
                t.Fatal(err)
        }

        checkRefNotAvailable(ctx, t, cs, ref)

        // Commit should fail due to already exists
        err = w.Commit(ctx, int64(len(c1)), d1)
        if err == nil {
                t.Fatalf("Expected already exists error")
        } else if !errdefs.IsAlreadyExists(err) {
                if err := w.Close(); err != nil {
                        t.Errorf("close error: %+v", err)
                }
                t.Fatalf("Unexpected error: %+v", err)
        }

        w.Close()
        w, err = content.OpenWriter(ctx, cs, content.WithRef(ref))
        if err != nil {
                t.Fatal(err)
        }

        checkRefNotAvailable(ctx, t, cs, ref)

        if err := w.Close(); err != nil {
                t.Fatalf("Close failed: %+v", err)
        }

        // Create another writer with same reference to check available
        w, err = content.OpenWriter(ctx, cs, content.WithRef(ref))
        if err != nil {
                t.Fatalf("Failed to open writer: %+v", err)
        }
        if err := w.Close(); err != nil {
                t.Fatalf("Close failed: %+v", err)
        }
}

// checkUpdateStatus writes a 256-byte blob in three chunks (64, 128 and 64
// bytes) and, after opening the writer and after every chunk, verifies the
// writer's status and running digest with checkStatus. It then commits with
// the package-level labels and checks the stored info.
func checkUpdateStatus(ctx context.Context, t *testing.T, cs content.Store) {
        c1, d1 := createContent(256)

        // Timestamps taken around each operation bound the start/update
        // windows handed to checkStatus.
        preStart := time.Now()
        w1, err := content.OpenWriter(ctx, cs, content.WithRef("c1"), content.WithDescriptor(ocispec.Descriptor{Size: 256, Digest: d1}))
        if err != nil {
                t.Fatal(err)
        }
        defer w1.Close()
        postStart := time.Now()

        // Digest of zero bytes: nothing has been written yet.
        d := digest.FromBytes([]byte{})

        expected := content.Status{
                Ref:      "c1",
                Total:    256,
                Expected: d1,
        }
        // Before the first write the update window is the same as the start
        // window.
        preUpdate := preStart
        postUpdate := postStart

        checkStatus(t, w1, expected, d, preStart, postStart, preUpdate, postUpdate)

        // Write first 64 bytes
        preUpdate = time.Now()
        if _, err := w1.Write(c1[:64]); err != nil {
                t.Fatalf("Failed to write: %+v", err)
        }
        postUpdate = time.Now()
        expected.Offset = 64
        d = digest.FromBytes(c1[:64])
        checkStatus(t, w1, expected, d, preStart, postStart, preUpdate, postUpdate)

        // Write next 128 bytes
        preUpdate = time.Now()
        if _, err := w1.Write(c1[64:192]); err != nil {
                t.Fatalf("Failed to write: %+v", err)
        }
        postUpdate = time.Now()
        expected.Offset = 192
        d = digest.FromBytes(c1[:192])
        checkStatus(t, w1, expected, d, preStart, postStart, preUpdate, postUpdate)

        // Write last 64 bytes
        preUpdate = time.Now()
        if _, err := w1.Write(c1[192:]); err != nil {
                t.Fatalf("Failed to write: %+v", err)
        }
        postUpdate = time.Now()
        expected.Offset = 256
        // With everything written the running digest is the blob's digest.
        checkStatus(t, w1, expected, d1, preStart, postStart, preUpdate, postUpdate)

        preCommit := time.Now()
        if err := w1.Commit(ctx, 0, "", content.WithLabels(labels)); err != nil {
                t.Fatalf("Commit failed: %+v", err)
        }
        postCommit := time.Now()

        info := content.Info{
                Digest: d1,
                Size:   256,
                Labels: labels,
        }

        if err := checkInfo(ctx, cs, d1, info, preCommit, postCommit, preCommit, postCommit); err != nil {
                t.Fatalf("Check info failed: %+v", err)
        }
}

// checkLabels commits a blob with an initial label set and then exercises
// Update twice: once without field paths and once restricted to two label
// field paths. After each step the stored info is compared against the
// expected labels and timestamp windows with checkInfo.
func checkLabels(ctx context.Context, t *testing.T, cs content.Store) {
        c1, d1 := createContent(256)

        w1, err := content.OpenWriter(ctx, cs, content.WithRef("c1-checklabels"), content.WithDescriptor(ocispec.Descriptor{Size: 256, Digest: d1}))
        if err != nil {
                t.Fatal(err)
        }
        defer w1.Close()

        if _, err := w1.Write(c1); err != nil {
                t.Fatalf("Failed to write: %+v", err)
        }

        rootTime := time.Now().UTC().Format(time.RFC3339)
        // This local map shadows the package-level labels variable.
        labels := map[string]string{
                "k1": "v1",
                "k2": "v2",

                "containerd.io/gc.root": rootTime,
        }

        preCommit := time.Now()
        if err := w1.Commit(ctx, 0, "", content.WithLabels(labels)); err != nil {
                t.Fatalf("Commit failed: %+v", err)
        }
        postCommit := time.Now()

        info := content.Info{
                Digest: d1,
                Size:   256,
                Labels: labels,
        }

        if err := checkInfo(ctx, cs, d1, info, preCommit, postCommit, preCommit, postCommit); err != nil {
                t.Fatalf("Check info failed: %+v", err)
        }

        // Change one label, drop one and add one, then update with no field
        // paths; the following checkInfo expects the stored labels to equal
        // this map.
        labels["k1"] = "newvalue"
        delete(labels, "k2")
        labels["k3"] = "v3"

        info.Labels = labels
        preUpdate := time.Now()
        if _, err := cs.Update(ctx, info); err != nil {
                t.Fatalf("Update failed: %+v", err)
        }
        postUpdate := time.Now()

        // The created window stays at the commit; only the updated window
        // moves.
        if err := checkInfo(ctx, cs, d1, info, preCommit, postCommit, preUpdate, postUpdate); err != nil {
                t.Fatalf("Check info failed: %+v", err)
        }

        // Update restricted to the k3 and k1 label paths. k3 is absent from
        // the supplied map, and the following checkInfo expects it to be gone
        // and k1 to be back at "v1".
        info.Labels = map[string]string{
                "k1": "v1",

                "containerd.io/gc.root": rootTime,
        }
        preUpdate = time.Now()
        if _, err := cs.Update(ctx, info, "labels.k3", "labels.k1"); err != nil {
                t.Fatalf("Update failed: %+v", err)
        }
        postUpdate = time.Now()

        if err := checkInfo(ctx, cs, d1, info, preCommit, postCommit, preUpdate, postUpdate); err != nil {
                t.Fatalf("Check info failed: %+v", err)
        }

}

// checkResume returns a check that exercises resuming a partially written
// blob. For every combination of blob size and written fraction it writes a
// prefix of the blob, closes the writer, reopens the same ref, verifies the
// reported offset equals the prefix length, and then hands the writer to rf
// to finish and commit. The stored info is verified afterwards.
//
// rf receives the context, the reopened writer, the full blob, the number of
// bytes already written, the total size and the blob's digest.
func checkResume(rf func(context.Context, content.Writer, []byte, int64, int64, digest.Digest) error) func(ctx context.Context, t *testing.T, cs content.Store) {
        return func(ctx context.Context, t *testing.T, cs content.Store) {
                var (
                        sizes       = []int64{500, 5000, 50000}
                        truncations = []float64{0.0, 0.1, 0.5, 0.9, 1.0}
                )

                for si, size := range sizes {
                        for ti, fraction := range truncations {
                                data, dgst := createContent(size)
                                written := int64(float64(size) * fraction)
                                ref := fmt.Sprintf("ref-%d-%d", si, ti)

                                // First session: write the prefix and close.
                                w, err := content.OpenWriter(ctx, cs, content.WithRef(ref), content.WithDescriptor(ocispec.Descriptor{Size: size, Digest: dgst}))
                                if err != nil {
                                        t.Fatal(err)
                                }

                                if _, err := w.Write(data[:written]); err != nil {
                                        w.Close()
                                        t.Fatal(err)
                                }

                                if err := w.Close(); err != nil {
                                        t.Fatal(err)
                                }

                                // Second session: reopen and confirm the offset.
                                w, err = content.OpenWriter(ctx, cs, content.WithRef(ref), content.WithDescriptor(ocispec.Descriptor{Size: size, Digest: dgst}))
                                if err != nil {
                                        t.Fatal(err)
                                }

                                status, err := w.Status()
                                if err != nil {
                                        w.Close()
                                        t.Fatal(err)
                                }

                                if status.Offset != written {
                                        w.Close()
                                        t.Fatalf("Unexpected offset %d, expected %d", status.Offset, written)
                                }

                                preCommit := time.Now()
                                if err := rf(ctx, w, data, written, size, dgst); err != nil {
                                        t.Fatalf("Resume failed: %+v", err)
                                }
                                postCommit := time.Now()

                                if err := w.Close(); err != nil {
                                        t.Fatal(err)
                                }

                                want := content.Info{
                                        Digest: dgst,
                                        Size:   size,
                                }

                                if err := checkInfo(ctx, cs, dgst, want, preCommit, postCommit, preCommit, postCommit); err != nil {
                                        t.Fatalf("Check info failed: %+v", err)
                                }
                        }
                }
        }
}

// resumeTruncate resumes a write by discarding whatever the writer already
// holds: it truncates w back to offset 0, writes all of b again and commits
// with the given size and digest. The written offset is not consulted.
func resumeTruncate(ctx context.Context, w content.Writer, b []byte, written, size int64, dgst digest.Digest) error {
        err := w.Truncate(0)
        if err != nil {
                return fmt.Errorf("truncate failed: %w", err)
        }

        // Scratch buffer handed to io.CopyBuffer for the full rewrite.
        scratch := make([]byte, 1024)
        if _, err = io.CopyBuffer(w, bytes.NewReader(b), scratch); err != nil {
                return fmt.Errorf("write failed: %w", err)
        }

        if err = w.Commit(ctx, size, dgst); err != nil {
                return fmt.Errorf("commit failed: %w", err)
        }
        return nil
}

// resumeDiscard resumes a write by sending only the part of b that starts at
// offset written, then commits with the given size and digest.
func resumeDiscard(ctx context.Context, w content.Writer, b []byte, written, size int64, dgst digest.Digest) error {
        remaining := bytes.NewReader(b[written:])
        if _, err := io.CopyBuffer(w, remaining, make([]byte, 1024)); err != nil {
                return fmt.Errorf("write failed: %w", err)
        }

        err := w.Commit(ctx, size, dgst)
        if err != nil {
                return fmt.Errorf("commit failed: %w", err)
        }
        return nil
}

// resumeCopy resumes a write through content.Copy, handing it a source that
// exposes only the io.Reader method set of the underlying bytes.Reader.
func resumeCopy(ctx context.Context, w content.Writer, b []byte, _, size int64, dgst digest.Digest) error {
        // The anonymous wrapper hides every method except Read.
        src := struct{ io.Reader }{Reader: bytes.NewReader(b)}

        err := content.Copy(ctx, w, src, size, dgst)
        if err != nil {
                return fmt.Errorf("copy failed: %w", err)
        }
        return nil
}

// resumeCopySeeker resumes a write through content.Copy, handing it a source
// that exposes only the io.ReadSeeker method set (Read and Seek) of the
// underlying bytes.Reader.
func resumeCopySeeker(ctx context.Context, w content.Writer, b []byte, _, size int64, dgst digest.Digest) error {
        // The anonymous wrapper hides every method except Read and Seek.
        src := struct{ io.ReadSeeker }{ReadSeeker: bytes.NewReader(b)}

        err := content.Copy(ctx, w, src, size, dgst)
        if err != nil {
                return fmt.Errorf("copy failed: %w", err)
        }
        return nil
}

// resumeCopyReaderAt resumes a write through content.Copy, handing it a
// source that exposes only Read and ReadAt of the underlying bytes.Reader.
func resumeCopyReaderAt(ctx context.Context, w content.Writer, b []byte, _, size int64, dgst digest.Digest) error {
        // readAtOnly is the exact method set the wrapped source may expose.
        type readAtOnly interface {
                io.Reader
                io.ReaderAt
        }

        src := struct{ readAtOnly }{readAtOnly: bytes.NewReader(b)}

        err := content.Copy(ctx, w, src, size, dgst)
        if err != nil {
                return fmt.Errorf("copy failed: %w", err)
        }
        return nil
}

// checkSmallBlob verifies that a blob reads back intact when the requested
// read window is one byte larger than the blob itself.
func checkSmallBlob(ctx context.Context, t *testing.T, store content.Store) {
        payload := []byte(`foobar`)
        payloadSize := int64(len(payload))
        want := digest.FromBytes(payload)

        // Write and commit the blob.
        w, err := store.Writer(ctx, content.WithRef(t.Name()), content.WithDescriptor(ocispec.Descriptor{Size: payloadSize, Digest: want}))
        if err != nil {
                t.Fatal(err)
        }
        if _, err = w.Write(payload); err != nil {
                t.Fatal(err)
        }
        if err = w.Commit(ctx, payloadSize, want); err != nil {
                t.Fatal(err)
        }
        if err = w.Close(); err != nil {
                t.Fatal(err)
        }

        // Read it back through a section that extends past the end of the blob.
        ra, err := store.ReaderAt(ctx, ocispec.Descriptor{Digest: want})
        if err != nil {
                t.Fatal(err)
        }
        defer ra.Close()

        window := io.NewSectionReader(ra, 0, payloadSize+1)
        got, err := io.ReadAll(window)
        if err != nil {
                t.Fatal(err)
        }
        if err = ra.Close(); err != nil {
                t.Fatal(err)
        }

        if gotDigest := digest.FromBytes(got); want != gotDigest {
                t.Fatalf("expected %s (%q), got %s (%q)", want, string(payload),
                        gotDigest, string(got))
        }
}

// checkCrossNSShare writes a blob under ctx, then opens a writer for the same
// ref and descriptor under a second context obtained from the ContextWrapper.
// It expects that writer to report the blob as fully written already and,
// after committing, expects the content to be readable from both contexts.
// The test is skipped when ctx carries no ContextWrapper.
func checkCrossNSShare(ctx context.Context, t *testing.T, cs content.Store) {
        wrap, ok := ctx.Value(wrapperKey{}).(ContextWrapper)
        if !ok {
                t.Skip("multiple contexts not supported")
        }

        size := int64(1000)
        data, dgst := createContent(size)
        ref := fmt.Sprintf("ref-%d", size)
        desc := ocispec.Descriptor{Size: size, Digest: dgst}
        t1 := time.Now()

        err := content.WriteBlob(ctx, cs, ref, bytes.NewReader(data), desc)
        if err != nil {
                t.Fatal(err)
        }

        ctx2, done, err := wrap(context.Background(), false)
        if err != nil {
                t.Fatal(err)
        }
        defer done(ctx2)

        w, err := content.OpenWriter(ctx2, cs, content.WithRef(ref), content.WithDescriptor(desc))
        if err != nil {
                t.Fatal(err)
        }
        defer w.Close()
        t2 := time.Now()

        wantStatus := content.Status{Ref: ref, Offset: size, Total: size}
        checkStatus(t, w, wantStatus, dgst, t1, t2, t1, t2)

        if err = w.Commit(ctx2, size, dgst); err != nil {
                t.Fatal(err)
        }
        t3 := time.Now()

        // The same info must be observable from the original and the second context.
        wantInfo := content.Info{Digest: dgst, Size: size}
        for _, c := range []context.Context{ctx, ctx2} {
                if err := checkContent(c, cs, dgst, wantInfo, t1, t3, t1, t3); err != nil {
                        t.Fatal(err)
                }
        }
}

// checkCrossNSAppend writes a blob under ctx, then opens a writer for the same
// ref and descriptor under a second context, appends extra bytes and commits
// the longer content there. It then expects the original blob to be intact
// under ctx and the extended blob to be present under the second context.
// The test is skipped when ctx carries no ContextWrapper.
func checkCrossNSAppend(ctx context.Context, t *testing.T, cs content.Store) {
        wrap, ok := ctx.Value(wrapperKey{}).(ContextWrapper)
        if !ok {
                t.Skip("multiple contexts not supported")
        }

        size := int64(1000)
        data, dgst := createContent(size)
        ref := fmt.Sprintf("ref-%d", size)
        desc := ocispec.Descriptor{Size: size, Digest: dgst}
        t1 := time.Now()

        err := content.WriteBlob(ctx, cs, ref, bytes.NewReader(data), desc)
        if err != nil {
                t.Fatal(err)
        }

        ctx2, done, err := wrap(context.Background(), false)
        if err != nil {
                t.Fatal(err)
        }
        defer done(ctx2)

        // Expected final content in the second context: original bytes plus suffix.
        suffix := []byte("appended bytes")
        extendedSize := size + int64(len(suffix))
        extended := make([]byte, extendedSize)
        copy(extended[:size], data)
        copy(extended[size:], suffix)
        extendedDigest := digest.FromBytes(extended)

        w, err := content.OpenWriter(ctx2, cs, content.WithRef(ref), content.WithDescriptor(desc))
        if err != nil {
                t.Fatal(err)
        }
        defer w.Close()
        t2 := time.Now()

        wantStatus := content.Status{Ref: ref, Offset: size, Total: size}
        checkStatus(t, w, wantStatus, dgst, t1, t2, t1, t2)

        if _, err = w.Write(suffix); err != nil {
                t.Fatal(err)
        }
        if err = w.Commit(ctx2, extendedSize, extendedDigest); err != nil {
                t.Fatal(err)
        }
        t3 := time.Now()

        originalInfo := content.Info{Digest: dgst, Size: size}
        if err = checkContent(ctx, cs, dgst, originalInfo, t1, t3, t1, t3); err != nil {
                t.Fatal(err)
        }

        extendedInfo := content.Info{Digest: extendedDigest, Size: extendedSize}
        if err = checkContent(ctx2, cs, extendedDigest, extendedInfo, t1, t3, t1, t3); err != nil {
                t.Fatal(err)
        }
}

// checkCrossNSIsolate writes a blob under ctx, then opens a writer for the
// same ref and descriptor under a second context and expects that writer to
// look newly created (see checkNewlyCreated). The test is skipped when ctx
// carries no ContextWrapper.
func checkCrossNSIsolate(ctx context.Context, t *testing.T, cs content.Store) {
        wrap, ok := ctx.Value(wrapperKey{}).(ContextWrapper)
        if !ok {
                t.Skip("multiple contexts not supported")
        }

        size := int64(1000)
        data, dgst := createContent(size)
        ref := fmt.Sprintf("ref-%d", size)
        desc := ocispec.Descriptor{Size: size, Digest: dgst}

        // t1..t2 brackets the original write.
        t1 := time.Now()
        err := content.WriteBlob(ctx, cs, ref, bytes.NewReader(data), desc)
        if err != nil {
                t.Fatal(err)
        }
        t2 := time.Now()

        ctx2, done, err := wrap(context.Background(), false)
        if err != nil {
                t.Fatal(err)
        }
        defer done(ctx2)

        // t3..t4 brackets opening the writer in the second context.
        t3 := time.Now()
        w, err := content.OpenWriter(ctx2, cs, content.WithRef(ref), content.WithDescriptor(desc))
        if err != nil {
                t.Fatal(err)
        }
        defer w.Close()
        t4 := time.Now()

        checkNewlyCreated(t, w, t1, t2, t3, t4)
}

// checkSharedNSIsolate writes a blob under a context obtained with
// wrap(..., true), then opens a writer for the same ref and descriptor under a
// context obtained with wrap(..., false). It expects that writer to report the
// blob as fully written and, after committing, expects the content to be
// readable from both contexts. The test is skipped when ctx carries no
// ContextWrapper.
func checkSharedNSIsolate(ctx context.Context, t *testing.T, cs content.Store) {
        wrap, ok := ctx.Value(wrapperKey{}).(ContextWrapper)
        if !ok {
                t.Skip("multiple contexts not supported")
        }

        ctx1, done1, err := wrap(context.Background(), true)
        if err != nil {
                t.Fatal(err)
        }
        defer done1(ctx1)

        size := int64(1000)
        data, dgst := createContent(size)
        ref := fmt.Sprintf("ref-%d", size)
        desc := ocispec.Descriptor{Size: size, Digest: dgst}
        t1 := time.Now()

        if err = content.WriteBlob(ctx1, cs, ref, bytes.NewReader(data), desc); err != nil {
                t.Fatal(err)
        }

        ctx2, done2, err := wrap(context.Background(), false)
        if err != nil {
                t.Fatal(err)
        }
        defer done2(ctx2)

        w, err := content.OpenWriter(ctx2, cs, content.WithRef(ref), content.WithDescriptor(desc))
        if err != nil {
                t.Fatal(err)
        }
        defer w.Close()
        t2 := time.Now()

        wantStatus := content.Status{Ref: ref, Offset: size, Total: size}
        checkStatus(t, w, wantStatus, dgst, t1, t2, t1, t2)

        if err = w.Commit(ctx2, size, dgst); err != nil {
                t.Fatal(err)
        }
        t3 := time.Now()

        // The same info must be observable from both wrapped contexts.
        wantInfo := content.Info{Digest: dgst, Size: size}
        for _, c := range []context.Context{ctx1, ctx2} {
                if err := checkContent(c, cs, dgst, wantInfo, t1, t3, t1, t3); err != nil {
                        t.Fatal(err)
                }
        }
}

// checkStatus fails the test unless the writer's digest equals d and its
// status matches expected in Ref, Offset and Total. On non-Windows platforms
// it also requires StartedAt to lie within [preStart, postStart] and
// UpdatedAt within [preUpdate, postUpdate].
func checkStatus(t *testing.T, w content.Writer, expected content.Status, d digest.Digest, preStart, postStart, preUpdate, postUpdate time.Time) {
        t.Helper()

        st, err := w.Status()
        if err != nil {
                t.Fatalf("failed to get status: %v", err)
        }

        if got := w.Digest(); got != d {
                t.Fatalf("unexpected digest %v, expected %v", got, d)
        }
        if st.Ref != expected.Ref {
                t.Fatalf("unexpected ref %q, expected %q", st.Ref, expected.Ref)
        }
        if st.Offset != expected.Offset {
                t.Fatalf("unexpected offset %d, expected %d", st.Offset, expected.Offset)
        }
        if st.Total != expected.Total {
                t.Fatalf("unexpected total %d, expected %d", st.Total, expected.Total)
        }

        // TODO: Add this test once all implementations guarantee this value is held
        //if st.Expected != expected.Expected {
        //        t.Fatalf("unexpected \"expected digest\" %q, expected %q", st.Expected, expected.Expected)
        //}

        // FIXME: broken on windows: unexpected updated at time 2017-11-14 13:43:22.178013 -0800 PST,
        // expected between 2017-11-14 13:43:22.1790195 -0800 PST m=+1.022137300 and
        // 2017-11-14 13:43:22.1790195 -0800 PST m=+1.022137300
        if runtime.GOOS == "windows" {
                return
        }

        if st.StartedAt.Before(preStart) || st.StartedAt.After(postStart) {
                t.Fatalf("unexpected started at time %s, expected between %s and %s", st.StartedAt, preStart, postStart)
        }

        t.Logf("compare update %v against (%v, %v)", st.UpdatedAt, preUpdate, postUpdate)
        if st.UpdatedAt.Before(preUpdate) || st.UpdatedAt.After(postUpdate) {
                t.Fatalf("unexpected updated at time %s, expected between %s and %s", st.UpdatedAt, preUpdate, postUpdate)
        }
}

// checkNewlyCreated fails the test unless the writer looks freshly created:
// its digest is emptyDigest and its offset is 0. On non-Windows platforms it
// also requires StartedAt to lie within [postStart, postUpdate]. preStart and
// preUpdate are accepted but not consulted.
func checkNewlyCreated(t *testing.T, w content.Writer, preStart, postStart, preUpdate, postUpdate time.Time) {
        t.Helper()

        st, err := w.Status()
        if err != nil {
                t.Fatalf("failed to get status: %v", err)
        }

        if got := w.Digest(); got != emptyDigest {
                t.Fatalf("unexpected digest %v, expected %v", got, emptyDigest)
        }
        if st.Offset != 0 {
                t.Fatalf("unexpected offset %v", st.Offset)
        }

        if runtime.GOOS == "windows" {
                return
        }
        if st.StartedAt.Before(postStart) || st.StartedAt.After(postUpdate) {
                t.Fatalf("unexpected started at time %s, expected between %s and %s", st.StartedAt, postStart, postUpdate)
        }
}

// checkInfo fetches the store's info for d and compares it with expected.
// It checks the digest, the size, that CreatedAt lies within [c1, c2], that
// UpdatedAt lies within [u1, u2] (skipped on Windows), and that the labels
// match expected.Labels in count and value. It returns a descriptive error on
// the first mismatch and nil otherwise.
func checkInfo(ctx context.Context, cs content.Store, d digest.Digest, expected content.Info, c1, c2, u1, u2 time.Time) error {
        info, err := cs.Info(ctx, d)
        if err != nil {
                return fmt.Errorf("failed to get info: %w", err)
        }

        if info.Digest != d {
                return fmt.Errorf("unexpected info digest %s, expected %s", info.Digest, d)
        }
        if info.Size != expected.Size {
                return fmt.Errorf("unexpected info size %d, expected %d", info.Size, expected.Size)
        }

        if info.CreatedAt.Before(c1) || info.CreatedAt.After(c2) {
                return fmt.Errorf("unexpected created at time %s, expected between %s and %s", info.CreatedAt, c1, c2)
        }

        // FIXME: broken on windows: unexpected updated at time 2017-11-14 13:43:22.178013 -0800 PST,
        // expected between 2017-11-14 13:43:22.1790195 -0800 PST m=+1.022137300 and
        // 2017-11-14 13:43:22.1790195 -0800 PST m=+1.022137300
        if runtime.GOOS != "windows" {
                if info.UpdatedAt.Before(u1) || info.UpdatedAt.After(u2) {
                        return fmt.Errorf("unexpected updated at time %s, expected between %s and %s", info.UpdatedAt, u1, u2)
                }
        }

        if len(info.Labels) != len(expected.Labels) {
                return fmt.Errorf("mismatched number of labels\ngot:\n%#v\nexpected:\n%#v", info.Labels, expected.Labels)
        }
        for key, want := range expected.Labels {
                if got := info.Labels[key]; got != want {
                        return fmt.Errorf("unexpected value for label %q: %q, expected %q", key, got, want)
                }
        }

        return nil
}
// checkContent runs checkInfo for d and then reads the blob back, verifying
// that its length equals expected.Size and that its digest equals d.
func checkContent(ctx context.Context, cs content.Store, d digest.Digest, expected content.Info, c1, c2, u1, u2 time.Time) error {
        err := checkInfo(ctx, cs, d, expected, c1, c2, u1, u2)
        if err != nil {
                return err
        }

        blob, err := content.ReadBlob(ctx, cs, ocispec.Descriptor{Digest: d})
        if err != nil {
                return fmt.Errorf("failed to read blob: %w", err)
        }

        if n := len(blob); int64(n) != expected.Size {
                return fmt.Errorf("wrong blob size %d, expected %d", n, expected.Size)
        }

        if got := digest.FromBytes(blob); got != d {
                return fmt.Errorf("wrong digest %s, expected %s", got, d)
        }

        return nil
}

// contentSeed is incremented atomically on every createContent call and used
// as the seed for that call's random source.
var contentSeed int64

// createContent returns bytes read from a seeded pseudo-random source, limited
// to size, together with their digest. It panics if reading from the source
// fails.
func createContent(size int64) ([]byte, digest.Digest) {
        // Every call should see a different seed, yet one that follows from
        // call order so results stay fairly consistent between test runs; an
        // atomically incremented counter is good enough for that.
        seed := atomic.AddInt64(&contentSeed, 1)
        source := rand.New(rand.NewSource(seed))

        data, err := io.ReadAll(io.LimitReader(source, size))
        if err != nil {
                panic(err)
        }
        return data, digest.FromBytes(data)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package apply

import (
        "context"
        "fmt"
        "io"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/log"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// NewFileSystemApplier returns a diff.Applier that reads diff content from cs
// and applies it onto a mounted filesystem.
func NewFileSystemApplier(cs content.Provider) diff.Applier {
        applier := new(fsApplier)
        applier.store = cs
        return applier
}

// fsApplier is the applier returned by NewFileSystemApplier.
type fsApplier struct {
        // store is the provider the diff content is read from.
        store content.Provider
}

// emptyDesc is the zero-value descriptor returned alongside an error.
var emptyDesc = ocispec.Descriptor{}

// Apply reads the content described by desc from the applier's store, passes
// it through a chain of stream processors until the stream's media type is
// ocispec.MediaTypeImageLayer, and applies the resulting stream onto mounts.
//
// On success it returns a descriptor carrying ocispec.MediaTypeImageLayer, the
// number of bytes read from the final processor and the canonical digest of
// those bytes. On any failure it returns an empty descriptor and the error.
func (s *fsApplier) Apply(ctx context.Context, desc ocispec.Descriptor, mounts []mount.Mount, opts ...diff.ApplyOpt) (d ocispec.Descriptor, err error) {
        t1 := time.Now()
        // Log timing and input details only when the apply succeeded; the
        // closure inspects the named result err at return time.
        defer func() {
                if err == nil {
                        log.G(ctx).WithFields(log.Fields{
                                "d":      time.Since(t1),
                                "digest": desc.Digest,
                                "size":   desc.Size,
                                "media":  desc.MediaType,
                        }).Debugf("diff applied")
                }
        }()

        // Collect the caller's options into config.
        var config diff.ApplyConfig
        for _, o := range opts {
                if err := o(ctx, desc, &config); err != nil {
                        return emptyDesc, fmt.Errorf("failed to apply config opt: %w", err)
                }
        }

        ra, err := s.store.ReaderAt(ctx, desc)
        if err != nil {
                return emptyDesc, fmt.Errorf("failed to get reader from content store: %w", err)
        }
        defer ra.Close()

        // Build the processor chain: start from the raw content and keep
        // wrapping until a processor reports the image layer media type.
        // Every processor is remembered so its error state can be checked later.
        var processors []diff.StreamProcessor
        processor := diff.NewProcessorChain(desc.MediaType, content.NewReader(ra))
        processors = append(processors, processor)
        for {
                if processor, err = diff.GetProcessor(ctx, processor, config.ProcessorPayloads); err != nil {
                        return emptyDesc, fmt.Errorf("failed to get stream processor for %s: %w", desc.MediaType, err)
                }
                processors = append(processors, processor)
                if processor.MediaType() == ocispec.MediaTypeImageLayer {
                        break
                }
        }
        // NOTE(review): only the last processor in the chain is closed here;
        // presumably that is sufficient for the earlier ones - confirm.
        defer processor.Close()

        // Everything read from the final processor is hashed (via the tee) and
        // counted (via readCounter) as it is consumed.
        digester := digest.Canonical.Digester()
        rc := &readCounter{
                r: io.TeeReader(processor, digester.Hash()),
        }

        if err := apply(ctx, mounts, rc, config.SyncFs); err != nil {
                return emptyDesc, err
        }

        // Read any trailing data so the digest and byte count cover the whole
        // stream, not just the part apply consumed.
        if _, err := io.Copy(io.Discard, rc); err != nil {
                return emptyDesc, err
        }

        // Surface errors from any processor that exposes an Err method.
        for _, p := range processors {
                if ep, ok := p.(interface {
                        Err() error
                }); ok {
                        if err := ep.Err(); err != nil {
                                return emptyDesc, err
                        }
                }
        }
        return ocispec.Descriptor{
                MediaType: ocispec.MediaTypeImageLayer,
                Size:      rc.c,
                Digest:    digester.Digest(),
        }, nil
}

// readCounter wraps a reader and keeps a running total of the bytes read
// through it.
type readCounter struct {
        r io.Reader // underlying source
        c int64     // total number of bytes returned by Read so far
}

// Read forwards to the wrapped reader and adds any bytes it returned to the
// running total. The wrapped reader's result is passed through unchanged.
func (rc *readCounter) Read(p []byte) (int, error) {
        n, err := rc.r.Read(p)
        if n <= 0 {
                return n, err
        }
        rc.c += int64(n)
        return n, err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package apply

import (
        "context"
        "fmt"
        "io"
        "os"
        "strings"

        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/pkg/archive"
        "github.com/containerd/containerd/v2/pkg/userns"
        "github.com/containerd/errdefs"

        "golang.org/x/sys/unix"
)

// apply extracts the archive stream r onto the filesystem described by mounts.
//
// A single overlay mount is handled by applying directly into its upperdir
// with overlay whiteout conversion, unless the process runs in a user
// namespace or no upperdir option is present; in those cases, and for every
// other mount layout, the archive is applied through a temporary mount.
//
// When sync is true, doSyncFs is called after a successful apply on the
// overlay upperdir or, for a single bind mount, on the bind source. No sync is
// performed in this function for other mount layouts.
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader, sync bool) (retErr error) {
        switch {
        case len(mounts) == 1 && mounts[0].Type == "overlay":
                // OverlayConvertWhiteout (mknod c 0 0) doesn't work in userns.
                // https://github.com/containerd/containerd/issues/3762
                if userns.RunningInUserNS() {
                        // Leave the switch and use the temp-mount path below.
                        break
                }
                path, parents, err := getOverlayPath(mounts[0].Options)
                if err != nil {
                        // A missing upperdir is reported as an invalid argument;
                        // treat it as "cannot use the direct path" rather than a failure.
                        if errdefs.IsInvalidArgument(err) {
                                break
                        }
                        return err
                }
                opts := []archive.ApplyOpt{
                        archive.WithConvertWhiteout(archive.OverlayConvertWhiteout),
                }
                if len(parents) > 0 {
                        opts = append(opts, archive.WithParents(parents))
                }
                _, err = archive.Apply(ctx, path, r, opts...)
                if err == nil && sync {
                        err = doSyncFs(path)
                }
                return err
        case sync && len(mounts) == 1 && mounts[0].Type == "bind":
                // Sync the bind source after the temp-mount apply below has
                // returned, but only if that apply succeeded.
                defer func() {
                        if retErr != nil {
                                return
                        }

                        retErr = doSyncFs(mounts[0].Source)
                }()
        }
        // Generic path: mount to a temporary location and apply there.
        return mount.WithTempMount(ctx, mounts, func(root string) error {
                _, err := archive.Apply(ctx, root, r)
                return err
        })
}

// getOverlayPath extracts the upper directory and the list of lower
// directories from overlay mount options. The lowerdir value is split on ":".
// If an option occurs more than once, the last occurrence wins. It returns an
// error wrapping errdefs.ErrInvalidArgument when no non-empty upperdir is
// present.
func getOverlayPath(options []string) (upper string, lower []string, err error) {
        const (
                upperdirPrefix = "upperdir="
                lowerdirPrefix = "lowerdir="
        )

        for _, opt := range options {
                switch {
                case strings.HasPrefix(opt, upperdirPrefix):
                        upper = opt[len(upperdirPrefix):]
                case strings.HasPrefix(opt, lowerdirPrefix):
                        lower = strings.Split(opt[len(lowerdirPrefix):], ":")
                }
        }

        if upper != "" {
                return upper, lower, nil
        }
        return "", nil, fmt.Errorf("upperdir not found: %w", errdefs.ErrInvalidArgument)
}

// doSyncFs opens file and calls unix.Syncfs on its descriptor. The file is
// closed before returning.
func doSyncFs(file string) error {
        f, err := os.Open(file)
        if err != nil {
                return fmt.Errorf("failed to open %s: %w", file, err)
        }
        defer f.Close()

        if serr := unix.Syncfs(int(f.Fd())); serr != nil {
                return fmt.Errorf("failed to syncfs for %s: %w", file, serr)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package diff

import (
        "context"
        "io"
        "time"

        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/typeurl/v2"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// Config holds the parameters for a diff (Compare) operation. It is filled
// in by Opt functions.
type Config struct {
        // MediaType is the type of diff to generate.
        // The default depends on the differ,
        // e.g. application/vnd.oci.image.layer.v1.tar+gzip
        MediaType string

        // Reference is the content upload reference.
        // By default a random reference string is used.
        Reference string

        // Labels are the labels to apply to the generated content.
        Labels map[string]string

        // Compressor is a function to compress the diff stream
        // instead of the default gzip compressor. The differ passes
        // the MediaType of the target diff content to the compressor.
        // When using this field, MediaType must be specified as well.
        Compressor func(dest io.Writer, mediaType string) (io.WriteCloser, error)

        // SourceDateEpoch specifies the SOURCE_DATE_EPOCH without touching the env vars.
        SourceDateEpoch *time.Time
}

// Opt configures a diff operation by modifying its Config. A non-nil error
// reports that the option could not be applied.
type Opt func(*Config) error

// Comparer allows creation of filesystem diffs between mounts.
type Comparer interface {
        // Compare computes the difference between two mounts and returns a
        // descriptor for the computed diff. The options can provide
        // a ref which can be used to track the content creation of the diff.
        // The media type, which determines the format of the created
        // content, can also be provided as an option.
        Compare(ctx context.Context, lower, upper []mount.Mount, opts ...Opt) (ocispec.Descriptor, error)
}

// ApplyConfig holds the parameters for an apply operation. It is filled in
// by ApplyOpt functions.
type ApplyConfig struct {
        // ProcessorPayloads specifies the payloads sent to the stream
        // processors, keyed by string.
        ProcessorPayloads map[string]typeurl.Any
        // SyncFs requests that the underlying filesystem containing the
        // applied files be synchronized.
        SyncFs bool
}

// ApplyOpt configures an Apply operation by modifying its ApplyConfig. It
// receives the context and the descriptor being applied; a non-nil error
// reports that the option could not be applied.
type ApplyOpt func(context.Context, ocispec.Descriptor, *ApplyConfig) error

// Applier allows applying diffs onto mounts.
type Applier interface {
        // Apply applies the content referred to by the given descriptor to
        // the provided mounts. The method of applying is based on the
        // implementation and the content descriptor. For example, in the common
        // case the descriptor is a file system difference in tar format, and
        // that tar would be applied on top of the mounts.
        Apply(ctx context.Context, desc ocispec.Descriptor, mount []mount.Mount, opts ...ApplyOpt) (ocispec.Descriptor, error)
}

// WithCompressor returns an Opt that installs f as the function used to
// compress the diff stream.
func WithCompressor(f func(dest io.Writer, mediaType string) (io.WriteCloser, error)) Opt {
        return Opt(func(cfg *Config) error {
                cfg.Compressor = f
                return nil
        })
}

// WithMediaType returns an Opt that sets the media type used for creating the
// diff. When it is not specified, the differ chooses a default.
func WithMediaType(m string) Opt {
        return Opt(func(cfg *Config) error {
                cfg.MediaType = m
                return nil
        })
}

// WithReference returns an Opt that sets the content upload reference used by
// the diff operation, which lets the caller track the upload through the
// content store.
func WithReference(ref string) Opt {
        return Opt(func(cfg *Config) error {
                cfg.Reference = ref
                return nil
        })
}

// WithLabels returns an Opt that sets the content labels for the created diff
// content. The map is stored as given, not copied.
func WithLabels(labels map[string]string) Opt {
        return Opt(func(cfg *Config) error {
                cfg.Labels = labels
                return nil
        })
}

// WithPayloads returns an ApplyOpt that sets the processor payloads on the
// apply config. The map is stored as given, not copied.
func WithPayloads(payloads map[string]typeurl.Any) ApplyOpt {
        return ApplyOpt(func(_ context.Context, _ ocispec.Descriptor, cfg *ApplyConfig) error {
                cfg.ProcessorPayloads = payloads
                return nil
        })
}

// WithSyncFs returns an ApplyOpt that sets the SyncFs flag on the apply config.
func WithSyncFs(sync bool) ApplyOpt {
        return ApplyOpt(func(_ context.Context, _ ocispec.Descriptor, cfg *ApplyConfig) error {
                cfg.SyncFs = sync
                return nil
        })
}

// WithSourceDateEpoch returns an Opt that sets the timestamp used to provide
// control for reproducibility.
// See also https://reproducible-builds.org/docs/source-date-epoch/ .
//
// Since containerd v2.0, the whiteout timestamps are set to zero (1970-01-01),
// not to the source date epoch.
func WithSourceDateEpoch(tm *time.Time) Opt {
        return Opt(func(cfg *Config) error {
                cfg.SourceDateEpoch = tm
                return nil
        })
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proxy

import (
        "context"

        diffapi "github.com/containerd/containerd/api/services/diff/v1"
        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/pkg/epoch"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/errdefs"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"

        "google.golang.org/protobuf/types/known/timestamppb"
)

// NewDiffApplier returns a value implementing both Apply and Compare by
// issuing requests through the given diff API client.
func NewDiffApplier(client diffapi.DiffClient) interface{} {
        remote := new(diffRemote)
        remote.client = client
        return remote
}

// diffRemote implements Apply and Compare by forwarding them to a diff API
// client.
type diffRemote struct {
        // client is the diff API client the requests are sent through.
        client diffapi.DiffClient
}

// Apply evaluates opts into an apply config, converts the descriptor, mounts
// and processor payloads to their API form and sends an ApplyRequest through
// the client. It returns the descriptor from the response, or an empty
// descriptor with the option error or the converted gRPC error.
func (r *diffRemote) Apply(ctx context.Context, desc ocispec.Descriptor, mounts []mount.Mount, opts ...diff.ApplyOpt) (ocispec.Descriptor, error) {
        var (
                cfg  diff.ApplyConfig
                none ocispec.Descriptor
        )
        for i := range opts {
                if err := opts[i](ctx, desc, &cfg); err != nil {
                        return none, err
                }
        }

        // Convert each payload to the protobuf Any used on the wire.
        wirePayloads := make(map[string]*ptypes.Any)
        for name, payload := range cfg.ProcessorPayloads {
                wirePayloads[name] = protobuf.FromAny(payload)
        }

        resp, err := r.client.Apply(ctx, &diffapi.ApplyRequest{
                Diff:     oci.DescriptorToProto(desc),
                Mounts:   mount.ToProto(mounts),
                Payloads: wirePayloads,
                SyncFs:   cfg.SyncFs,
        })
        if err != nil {
                return none, errdefs.FromGRPC(err)
        }
        return oci.DescriptorFromProto(resp.Applied), nil
}

// Compare asks the remote diff service to compute the diff between the mount
// sets a and b and returns the descriptor reported back by the service.
//
// When the options leave SourceDateEpoch unset, the value stored in ctx (if
// any) is used instead. Errors from the remote call are passed through
// errdefs.FromGRPC.
func (r *diffRemote) Compare(ctx context.Context, a, b []mount.Mount, opts ...diff.Opt) (ocispec.Descriptor, error) {
        var cfg diff.Config
        for i := range opts {
                if err := opts[i](&cfg); err != nil {
                        return ocispec.Descriptor{}, err
                }
        }

        // An explicitly configured epoch wins over the one carried by ctx.
        ctxEpoch := epoch.FromContext(ctx)
        if cfg.SourceDateEpoch == nil && ctxEpoch != nil {
                cfg.SourceDateEpoch = ctxEpoch
        }

        req := &diffapi.DiffRequest{
                Left:      mount.ToProto(a),
                Right:     mount.ToProto(b),
                MediaType: cfg.MediaType,
                Ref:       cfg.Reference,
                Labels:    cfg.Labels,
        }
        if cfg.SourceDateEpoch != nil {
                req.SourceDateEpoch = timestamppb.New(*cfg.SourceDateEpoch)
        }

        resp, err := r.client.Diff(ctx, req)
        if err != nil {
                return ocispec.Descriptor{}, errdefs.FromGRPC(err)
        }

        result := oci.DescriptorFromProto(resp.Diff)
        return result, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package diff

import (
        "context"
        "errors"
        "io"
        "os"

        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/archive/compression"
        "github.com/containerd/typeurl/v2"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

var (
        // handlers holds the registered stream processor handlers in
        // registration order. RegisterProcessor appends to it and
        // GetProcessor walks it from the last entry to the first.
        handlers []Handler

        // ErrNoProcessor is returned when no stream processor is available for a media-type
        ErrNoProcessor = errors.New("no processor for media-type")
)

// init registers the package's built-in handler before any caller-provided
// one, so it is the last candidate GetProcessor tries.
func init() {
        // register the default compression handler
        RegisterProcessor(compressedHandler)
}

// RegisterProcessor registers a stream processor for media-types.
//
// The handler is appended to the package-level handler list; handlers
// registered later are consulted first by GetProcessor. No locking is done
// here, so callers are responsible for not racing with other registrations
// or lookups.
func RegisterProcessor(handler Handler) {
        handlers = append(handlers, handler)
}

// GetProcessor returns the processor for the media-type of the given stream.
//
// Registered handlers are tried from the most recently registered to the
// oldest, so user configured handlers are preferred over the defaults. The
// first handler that accepts the media-type initializes the processor with
// stream and payloads. ErrNoProcessor is returned when no handler accepts it.
func GetProcessor(ctx context.Context, stream StreamProcessor, payloads map[string]typeurl.Any) (StreamProcessor, error) {
        for idx := len(handlers); idx > 0; idx-- {
                initFn, matched := handlers[idx-1](ctx, stream.MediaType())
                if !matched {
                        continue
                }
                return initFn(ctx, stream, payloads)
        }
        return nil, ErrNoProcessor
}

// Handler checks a media-type and initializes the processor.
//
// It returns the init function to use together with true when it accepts the
// media-type; GetProcessor moves on to the next handler when the second
// result is false.
type Handler func(ctx context.Context, mediaType string) (StreamProcessorInit, bool)

// StaticHandler builds a Handler that accepts exactly one media-type: it
// yields fn when asked about expectedMediaType and declines every other
// media-type.
func StaticHandler(expectedMediaType string, fn StreamProcessorInit) Handler {
        return func(_ context.Context, candidate string) (StreamProcessorInit, bool) {
                if candidate != expectedMediaType {
                        return nil, false
                }
                return fn, true
        }
}

// StreamProcessorInit returns the initialized stream processor.
// It receives the stream to wrap and the payload map handed to GetProcessor.
type StreamProcessorInit func(ctx context.Context, stream StreamProcessor, payloads map[string]typeurl.Any) (StreamProcessor, error)

// RawProcessor provides access to direct fd for processing.
// NewBinaryProcessor checks for this interface so it can hand the file to the
// child process as stdin directly.
type RawProcessor interface {
        // File returns the fd for the read stream of the underlying processor
        File() *os.File
}

// StreamProcessor handles processing a content stream and transforming it into a different media-type
type StreamProcessor interface {
        io.ReadCloser

        // MediaType is the resulting media-type that the processor processes the stream into
        MediaType() string
}

// compressedHandler is the default handler registered by this package.
//
// It declines media-types for which images.DiffCompression reports an error.
// For the others it returns an init function that either wraps the stream
// unchanged (no compression reported) or wraps it in a decompressing reader.
func compressedHandler(ctx context.Context, mediaType string) (StreamProcessorInit, bool) {
        algorithm, err := images.DiffCompression(ctx, mediaType)
        if err != nil {
                return nil, false
        }

        if algorithm == "" {
                // Nothing to decompress: pass the stream through as is.
                passthrough := func(ctx context.Context, stream StreamProcessor, payloads map[string]typeurl.Any) (StreamProcessor, error) {
                        return &stdProcessor{rc: stream}, nil
                }
                return passthrough, true
        }

        decompress := func(ctx context.Context, stream StreamProcessor, payloads map[string]typeurl.Any) (StreamProcessor, error) {
                decompressed, err := compression.DecompressStream(stream)
                if err != nil {
                        return nil, err
                }
                return &compressedProcessor{rc: decompressed}, nil
        }
        return decompress, true
}

// NewProcessorChain initializes the root StreamProcessor, which reports mt
// as its media-type and reads its content from r.
func NewProcessorChain(mt string, r io.Reader) StreamProcessor {
        root := new(processorChain)
        root.mt = mt
        root.rc = r
        return root
}

// processorChain is the root of a processor chain: a plain reader tagged
// with the media-type of its content.
type processorChain struct {
        mt string    // media-type reported by MediaType
        rc io.Reader // source the content is read from
}

// MediaType returns the media-type the chain was created with.
func (pc *processorChain) MediaType() string {
        return pc.mt
}

// Read reads from the underlying reader.
func (pc *processorChain) Read(p []byte) (int, error) {
        n, err := pc.rc.Read(p)
        return n, err
}

// Close is a no-op and always returns nil; the underlying reader is left
// untouched.
func (pc *processorChain) Close() error {
        return nil
}

// stdProcessor passes an uncompressed stream through unchanged while
// reporting it as an OCI image layer.
type stdProcessor struct {
        rc StreamProcessor // wrapped stream
}

// MediaType always reports the OCI image layer media-type.
func (sp *stdProcessor) MediaType() string {
        return ocispec.MediaTypeImageLayer
}

// Read reads from the wrapped stream.
func (sp *stdProcessor) Read(p []byte) (int, error) {
        n, err := sp.rc.Read(p)
        return n, err
}

// Close is a no-op and always returns nil; the wrapped stream is not closed.
func (sp *stdProcessor) Close() error {
        return nil
}

// compressedProcessor exposes a decompressed stream and reports it as an OCI
// image layer.
type compressedProcessor struct {
        rc io.ReadCloser // decompressing reader
}

// MediaType always reports the OCI image layer media-type.
func (cp *compressedProcessor) MediaType() string {
        return ocispec.MediaTypeImageLayer
}

// Read reads decompressed data from the underlying reader.
func (cp *compressedProcessor) Read(p []byte) (int, error) {
        n, err := cp.rc.Read(p)
        return n, err
}

// Close closes the underlying decompressing reader and returns its error.
func (cp *compressedProcessor) Close() error {
        err := cp.rc.Close()
        return err
}

// BinaryHandler creates a new stream processor handler which calls out to the given binary.
//
// The id identifies the stream processor: the payload stored under id in the
// payload map is the one handed to the binary (i.e. decryption keys for a
// decrypt stream processor). The handler accepts any of the provided
// mediaTypes and the resulting processor reports returnsMediaType. The binary
// at path is started with args and the extra environment env.
func BinaryHandler(id, returnsMediaType string, mediaTypes []string, path string, args, env []string) Handler {
        accepted := make(map[string]struct{}, len(mediaTypes))
        for i := range mediaTypes {
                accepted[mediaTypes[i]] = struct{}{}
        }

        return func(_ context.Context, mediaType string) (StreamProcessorInit, bool) {
                if _, found := accepted[mediaType]; !found {
                        return nil, false
                }

                start := func(ctx context.Context, stream StreamProcessor, payloads map[string]typeurl.Any) (StreamProcessor, error) {
                        // A missing entry yields a nil payload.
                        return NewBinaryProcessor(ctx, mediaType, returnsMediaType, stream, path, args, env, payloads[id])
                }
                return start, true
        }
}

// mediaTypeEnvVar is the name of the environment variable through which
// NewBinaryProcessor tells the processor binary the input media-type.
const mediaTypeEnvVar = "STREAM_PROCESSOR_MEDIATYPE"

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package diff

import (
        "bytes"
        "context"
        "errors"
        "fmt"
        "io"
        "os"
        "os/exec"
        "sync"

        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/containerd/v2/pkg/protobuf/proto"
        "github.com/containerd/typeurl/v2"
)

// NewBinaryProcessor returns a binary processor for use with processing content streams.
//
// The binary name is started with args; its environment is the current
// process environment plus env plus the input media-type imt in the
// STREAM_PROCESSOR_MEDIATYPE variable. The stream is connected to the child's
// stdin and the child's stdout becomes the content of the returned processor,
// which reports rmt as its media-type.
//
// When payload is non-nil it is marshalled and written to a pipe whose read
// end is passed to the child as its first extra file.
//
// On failure every pipe created here is closed again; the input stream is
// left untouched and remains owned by the caller.
func NewBinaryProcessor(ctx context.Context, imt, rmt string, stream StreamProcessor, name string, args, env []string, payload typeurl.Any) (StreamProcessor, error) {
        cmd := exec.CommandContext(ctx, name, args...)
        cmd.Env = os.Environ()
        cmd.Env = append(cmd.Env, env...)

        var payloadC io.Closer
        if payload != nil {
                pb := protobuf.FromAny(payload)
                data, err := proto.Marshal(pb)
                if err != nil {
                        return nil, err
                }
                r, w, err := os.Pipe()
                if err != nil {
                        return nil, err
                }
                // Feed the payload asynchronously so a payload larger than the
                // pipe buffer cannot block process start-up. The write fails and
                // the goroutine ends once the read end is closed.
                go func() {
                        io.Copy(w, bytes.NewReader(data))
                        w.Close()
                }()

                cmd.ExtraFiles = append(cmd.ExtraFiles, r)
                payloadC = r
        }
        cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", mediaTypeEnvVar, imt))

        var (
                stdin  io.Reader
                closer func() error
        )
        if f, ok := stream.(RawProcessor); ok {
                // Hand the file to the child directly; our copy is closed once
                // the child has been started.
                stdin = f.File()
                closer = f.File().Close
        } else {
                stdin = stream
        }
        cmd.Stdin = stdin

        r, w, err := os.Pipe()
        if err != nil {
                // Do not leak the payload pipe (and its writer goroutine).
                if payloadC != nil {
                        payloadC.Close()
                }
                return nil, err
        }
        cmd.Stdout = w

        stderr := bytes.NewBuffer(nil)
        cmd.Stderr = stderr

        if err := cmd.Start(); err != nil {
                // The child never ran: release both ends of the stdout pipe and
                // the payload pipe instead of leaking the descriptors.
                r.Close()
                w.Close()
                if payloadC != nil {
                        payloadC.Close()
                }
                return nil, err
        }
        p := &binaryProcessor{
                cmd:    cmd,
                r:      r,
                mt:     rmt,
                stderr: stderr,
                done:   make(chan struct{}),
        }
        go p.wait()

        // close after start and dup
        w.Close()
        if closer != nil {
                closer()
        }
        if payloadC != nil {
                payloadC.Close()
        }
        return p, nil
}

// binaryProcessor is a stream processor backed by a running child process;
// the processed content is read from the child's stdout pipe.
type binaryProcessor struct {
        cmd    *exec.Cmd     // the started child process
        r      *os.File      // read end of the child's stdout pipe
        mt     string        // media-type reported by MediaType
        stderr *bytes.Buffer // collects the child's stderr output

        mu  sync.Mutex // guards err
        err error

        // There is a race condition between waiting on c.cmd.Wait() and setting c.err within
        // c.wait(), and reading that value from c.Err().
        // Use done to wait for the returned error to be captured and set.
        done chan struct{}
}

// Err returns the error recorded by wait, or nil if none has been recorded
// (yet).
func (c *binaryProcessor) Err() error {
        c.mu.Lock()
        defer c.mu.Unlock()
        return c.err
}

// wait blocks until the child process has finished, records its failure and
// then closes done. Only an exit error is recorded; the recorded error carries
// the child's stderr output as its message. Other errors returned by
// cmd.Wait are not recorded.
func (c *binaryProcessor) wait() {
        if err := c.cmd.Wait(); err != nil {
                var exitErr *exec.ExitError
                if errors.As(err, &exitErr) {
                        c.mu.Lock()
                        c.err = errors.New(c.stderr.String())
                        c.mu.Unlock()
                }
        }
        close(c.done)
}

// Wait blocks until the child process has finished or ctx is done, whichever
// comes first. It returns the recorded process error in the first case and
// ctx.Err() in the second.
func (c *binaryProcessor) Wait(ctx context.Context) error {
        select {
        case <-c.done:
                return c.Err()
        case <-ctx.Done():
                return ctx.Err()
        }
}

// File returns the read end of the child's stdout pipe.
func (c *binaryProcessor) File() *os.File {
        return c.r
}

// MediaType returns the media-type produced by the binary.
func (c *binaryProcessor) MediaType() string {
        return c.mt
}

// Read reads processed content from the child's stdout pipe.
func (c *binaryProcessor) Read(p []byte) (int, error) {
        return c.r.Read(p)
}

// Close closes the stdout pipe and kills the child process.
//
// The error from closing the pipe takes precedence over the one from killing
// the process. A child that has already exited is not treated as a failure:
// Kill reports os.ErrProcessDone in that case, which is ignored so that
// closing a processor that ran to completion returns nil.
func (c *binaryProcessor) Close() error {
        err := c.r.Close()
        kerr := c.cmd.Process.Kill()
        if errors.Is(kerr, os.ErrProcessDone) {
                kerr = nil
        }
        if err == nil {
                err = kerr
        }
        return err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package events

import (
        "context"
        "time"

        "github.com/containerd/typeurl/v2"
)

// Envelope provides the packaging for an event.
type Envelope struct {
        // Timestamp is the time attached to the event.
        Timestamp time.Time
        // Namespace is the namespace the event belongs to.
        Namespace string
        // Topic is the topic the event is published under.
        Topic     string
        // Event is the encoded event payload.
        Event     typeurl.Any
}

// Field returns the value for the given fieldpath as a string, if defined.
// If the value is not defined, the second value will be false.
//
// The first path element selects "namespace", "topic" or "event". For
// "event" the payload is decoded and the rest of the path is resolved by the
// decoded value, provided it has a Field method of its own.
func (e *Envelope) Field(fieldpath []string) (string, bool) {
        if len(fieldpath) == 0 {
                return "", false
        }

        head, rest := fieldpath[0], fieldpath[1:]

        // unhandled: timestamp
        if head == "namespace" {
                return e.Namespace, e.Namespace != ""
        }
        if head == "topic" {
                return e.Topic, e.Topic != ""
        }
        if head != "event" {
                return "", false
        }

        payload, err := typeurl.UnmarshalAny(e.Event)
        if err != nil {
                return "", false
        }

        type fielder interface {
                Field([]string) (string, bool)
        }
        if f, ok := payload.(fielder); ok {
                return f.Field(rest)
        }
        return "", false
}

// Event is a generic interface for any type of event
type Event interface{}

// Publisher posts the event.
type Publisher interface {
        // Publish posts event under the given topic.
        Publish(ctx context.Context, topic string, event Event) error
}

// Forwarder forwards an event to the underlying event bus
type Forwarder interface {
        // Forward passes on an already packaged envelope.
        Forward(ctx context.Context, envelope *Envelope) error
}

// Subscriber allows callers to subscribe to events
type Subscriber interface {
        // Subscribe returns a channel of envelopes and a channel of errors for
        // the subscription described by filters.
        Subscribe(ctx context.Context, filters ...string) (ch <-chan *Envelope, errs <-chan error)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package exchange

import (
        "context"
        "fmt"
        "strings"
        "time"

        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/pkg/filters"
        "github.com/containerd/containerd/v2/pkg/identifiers"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
        goevents "github.com/docker/go-events"
)

// Exchange broadcasts events
type Exchange struct {
        // broadcaster fans every written envelope out to the sinks added by
        // Subscribe.
        broadcaster *goevents.Broadcaster
}

// NewExchange returns a new event Exchange backed by a fresh broadcaster.
func NewExchange() *Exchange {
        ex := new(Exchange)
        ex.broadcaster = goevents.NewBroadcaster()
        return ex
}

// Compile-time checks that *Exchange satisfies the events interfaces.
var (
        _ events.Publisher  = (*Exchange)(nil)
        _ events.Forwarder  = (*Exchange)(nil)
        _ events.Subscriber = (*Exchange)(nil)
)

// Forward accepts an envelope to be directly distributed on the exchange.
//
// This is useful when an event is forwarded on behalf of another namespace or
// when the event is propagated on behalf of another publisher.
//
// The envelope is validated first; an invalid envelope is rejected without
// being logged or broadcast. The outcome of the broadcast is logged.
func (e *Exchange) Forward(ctx context.Context, envelope *events.Envelope) (err error) {
        if invalid := validateEnvelope(envelope); invalid != nil {
                return invalid
        }

        // Log the outcome once the write below has set err.
        defer func() {
                entry := log.G(ctx).WithFields(log.Fields{
                        "topic": envelope.Topic,
                        "ns":    envelope.Namespace,
                        "type":  envelope.Event.GetTypeUrl(),
                })

                if err == nil {
                        entry.Trace("event forwarded")
                        return
                }
                entry.WithError(err).Error("error forwarding event")
        }()

        err = e.broadcaster.Write(envelope)
        return err
}

// Publish packages and sends an event. The caller will be considered the
// initial publisher of the event. This means the timestamp will be calculated
// at this point and this method may read from the calling context.
//
// The namespace is taken from ctx and is required. The topic is validated
// and the event encoded before anything is broadcast; failures of those
// steps are returned without being logged.
func (e *Exchange) Publish(ctx context.Context, topic string, event events.Event) (err error) {
        ns, nsErr := namespaces.NamespaceRequired(ctx)
        if nsErr != nil {
                return fmt.Errorf("failed publishing event: %w", nsErr)
        }

        if topicErr := validateTopic(topic); topicErr != nil {
                return fmt.Errorf("envelope topic %q: %w", topic, topicErr)
        }

        payload, encErr := typeurl.MarshalAny(event)
        if encErr != nil {
                return encErr
        }

        envelope := &events.Envelope{
                Timestamp: time.Now().UTC(),
                Namespace: ns,
                Topic:     topic,
                Event:     payload,
        }

        // Log the outcome once the write below has set err.
        defer func() {
                entry := log.G(ctx).WithFields(log.Fields{
                        "topic": envelope.Topic,
                        "ns":    envelope.Namespace,
                        "type":  envelope.Event.GetTypeUrl(),
                })

                if err == nil {
                        entry.Trace("event published")
                        return
                }
                entry.WithError(err).Error("error publishing event")
        }()

        err = e.broadcaster.Write(envelope)
        return err
}

// Subscribe to events on the exchange. Events are sent through the returned
// channel ch. If an error is encountered, it will be sent on channel errs and
// errs will be closed. To end the subscription, cancel the provided context.
//
// Zero or more filters may be provided as strings. Only events that match
// *any* of the provided filters will be sent on the channel. The filters use
// the standard containerd filters package syntax.
//
// When the subscription ends, exactly one value is sent on errs before it is
// closed: nil if ctx was canceled, the context error for any other context
// termination, or the error that ended the subscription. The event channel
// ch is never closed.
func (e *Exchange) Subscribe(ctx context.Context, fs ...string) (ch <-chan *events.Envelope, errs <-chan error) {
        var (
                evch                  = make(chan *events.Envelope)
                errq                  = make(chan error, 1)
                channel               = goevents.NewChannel(0)
                queue                 = goevents.NewQueue(channel)
                dst     goevents.Sink = queue
        )

        // closeAll tears the subscription down: it closes the sinks, detaches
        // dst from the broadcaster and closes the error channel.
        closeAll := func() {
                channel.Close()
                queue.Close()
                e.broadcaster.Remove(dst)
                close(errq)
        }

        ch = evch
        errs = errq

        if len(fs) > 0 {
                filter, err := filters.ParseAll(fs...)
                if err != nil {
                        errq <- fmt.Errorf("failed parsing subscription filters: %w", err)
                        closeAll()
                        return
                }

                // Only matching events reach the queue.
                dst = goevents.NewFilter(queue, goevents.MatcherFunc(func(gev goevents.Event) bool {
                        return filter.Match(adapt(gev))
                }))
        }

        e.broadcaster.Add(dst)

        go func() {
                defer closeAll()

                var err error
        loop:
                for {
                        select {
                        case ev := <-channel.C:
                                env, ok := ev.(*events.Envelope)
                                if !ok {
                                        // TODO(stevvooe): For the most part, we are well protected
                                        // from this condition. Both Forward and Publish protect
                                        // from this.
                                        err = fmt.Errorf("invalid envelope encountered %#v; please file a bug", ev)
                                        // Leave the loop, not just the select, so the error
                                        // is reported right away and the subscription ends.
                                        break loop
                                }

                                select {
                                case evch <- env:
                                case <-ctx.Done():
                                        break loop
                                }
                        case <-ctx.Done():
                                break loop
                        }
                }

                // A plain cancellation is the regular way to end a subscription
                // and is not reported as an error.
                if err == nil {
                        if cerr := ctx.Err(); cerr != context.Canceled {
                                err = cerr
                        }
                }

                errq <- err
        }()

        return
}

// validateTopic checks that topic is non-empty, starts with '/', and that
// each '/'-separated component after the leading slash passes
// identifiers.Validate. The structural failures wrap
// errdefs.ErrInvalidArgument.
func validateTopic(topic string) error {
        switch {
        case topic == "":
                return fmt.Errorf("must not be empty: %w", errdefs.ErrInvalidArgument)
        case topic[0] != '/':
                return fmt.Errorf("must start with '/': %w", errdefs.ErrInvalidArgument)
        case len(topic) == 1:
                return fmt.Errorf("must have at least one component: %w", errdefs.ErrInvalidArgument)
        }

        for _, part := range strings.Split(topic[1:], "/") {
                verr := identifiers.Validate(part)
                if verr == nil {
                        continue
                }
                return fmt.Errorf("failed validation on component %q: %w", part, verr)
        }

        return nil
}

// validateEnvelope checks an envelope before it is forwarded: the namespace
// must pass identifiers.Validate, the topic must pass validateTopic and the
// timestamp must be set. The first failing check is reported.
func validateEnvelope(envelope *events.Envelope) error {
        nsErr := identifiers.Validate(envelope.Namespace)
        if nsErr != nil {
                return fmt.Errorf("event envelope has invalid namespace: %w", nsErr)
        }

        topicErr := validateTopic(envelope.Topic)
        if topicErr != nil {
                return fmt.Errorf("envelope topic %q: %w", envelope.Topic, topicErr)
        }

        if !envelope.Timestamp.IsZero() {
                return nil
        }
        return fmt.Errorf("timestamp must be set on forwarded event: %w", errdefs.ErrInvalidArgument)
}

// adapt returns ev itself when it already is a filters.Adaptor. Any other
// value is replaced by an adaptor that reports every field as undefined.
func adapt(ev interface{}) filters.Adaptor {
        adaptor, ok := ev.(filters.Adaptor)
        if !ok {
                return filters.AdapterFunc(func([]string) (string, bool) {
                        return "", false
                })
        }
        return adaptor
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proxy

import (
        "context"
        "fmt"

        api "github.com/containerd/containerd/api/services/events/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
        "github.com/containerd/ttrpc"
        "github.com/containerd/typeurl/v2"
        "google.golang.org/grpc"
)

// EventService combines publishing, forwarding and subscribing to events in
// a single interface; it is what NewRemoteEvents returns.
type EventService interface {
        events.Publisher
        events.Forwarder
        events.Subscriber
}

// NewRemoteEvents returns an EventService that talks to a remote events
// service through client.
//
// client may be an api.EventsClient, an api.TTRPCEventsClient, a
// grpc.ClientConnInterface or a *ttrpc.Client; the candidates are checked in
// that order and the first match decides whether the GRPC or the TTRPC proxy
// is used. Any other value causes a panic with an error wrapping
// errdefs.ErrNotImplemented.
func NewRemoteEvents(client any) EventService {
        if c, ok := client.(api.EventsClient); ok {
                return &grpcEventsProxy{client: c}
        }
        if c, ok := client.(api.TTRPCEventsClient); ok {
                return &ttrpcEventsProxy{client: c}
        }
        if conn, ok := client.(grpc.ClientConnInterface); ok {
                return &grpcEventsProxy{client: api.NewEventsClient(conn)}
        }
        if conn, ok := client.(*ttrpc.Client); ok {
                return &ttrpcEventsProxy{client: api.NewTTRPCEventsClient(conn)}
        }
        panic(fmt.Errorf("unsupported events client %T: %w", client, errdefs.ErrNotImplemented))
}

// grpcEventsProxy implements EventService on top of a GRPC events client.
type grpcEventsProxy struct {
        client api.EventsClient
}

// Publish encodes event and sends it to the remote events service under
// topic. Encoding errors are returned as is; errors from the remote call are
// passed through errdefs.FromGRPC.
func (p *grpcEventsProxy) Publish(ctx context.Context, topic string, event events.Event) error {
        encoded, err := typeurl.MarshalAny(event)
        if err != nil {
                return err
        }

        _, err = p.client.Publish(ctx, &api.PublishRequest{
                Topic: topic,
                Event: protobuf.FromAny(encoded),
        })
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Forward converts envelope into its API representation and sends it to the
// remote events service. Errors from the remote call are passed through
// errdefs.FromGRPC.
func (p *grpcEventsProxy) Forward(ctx context.Context, envelope *events.Envelope) error {
        wire := &types.Envelope{
                Timestamp: protobuf.ToTimestamp(envelope.Timestamp),
                Namespace: envelope.Namespace,
                Topic:     envelope.Topic,
                Event:     protobuf.FromAny(envelope.Event),
        }

        _, err := p.client.Forward(ctx, &api.ForwardRequest{Envelope: wire})
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Subscribe opens a subscription on the remote events service and relays the
// received envelopes on ch.
//
// If the subscription cannot be opened, the error is sent on errs, which is
// then closed. Otherwise a goroutine receives until the stream fails or ctx
// is done: a receive error is sent on errs as is, a context error is sent
// unless it is context.Canceled, and errs is closed afterwards. The event
// channel ch is never closed.
func (p *grpcEventsProxy) Subscribe(ctx context.Context, filters ...string) (ch <-chan *events.Envelope, errs <-chan error) {
        envelopes := make(chan *events.Envelope)
        failures := make(chan error, 1)

        stream, err := p.client.Subscribe(ctx, &api.SubscribeRequest{Filters: filters})
        if err != nil {
                failures <- err
                close(failures)
                return envelopes, failures
        }

        go func() {
                defer close(failures)

                for {
                        msg, recvErr := stream.Recv()
                        if recvErr != nil {
                                failures <- recvErr
                                return
                        }

                        env := &events.Envelope{
                                Timestamp: protobuf.FromTimestamp(msg.Timestamp),
                                Namespace: msg.Namespace,
                                Topic:     msg.Topic,
                                Event:     msg.Event,
                        }

                        select {
                        case envelopes <- env:
                        case <-ctx.Done():
                                // A plain cancellation is not reported.
                                if cause := ctx.Err(); cause != context.Canceled {
                                        failures <- cause
                                }
                                return
                        }
                }
        }()

        return envelopes, failures
}

// ttrpcEventsProxy implements EventService on top of a TTRPC events client.
type ttrpcEventsProxy struct {
        client api.TTRPCEventsClient
}

// Publish encodes event and sends it to the remote events service under
// topic using the TTRPC client. Encoding errors are returned as is; errors
// from the remote call are passed through errdefs.FromGRPC.
func (p *ttrpcEventsProxy) Publish(ctx context.Context, topic string, event events.Event) error {
        encoded, err := typeurl.MarshalAny(event)
        if err != nil {
                return err
        }

        _, err = p.client.Publish(ctx, &api.PublishRequest{
                Topic: topic,
                Event: protobuf.FromAny(encoded),
        })
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Forward converts envelope into its API representation and sends it to the
// remote events service using the TTRPC client. Errors from the remote call
// are passed through errdefs.FromGRPC.
func (p *ttrpcEventsProxy) Forward(ctx context.Context, envelope *events.Envelope) error {
        wire := &types.Envelope{
                Timestamp: protobuf.ToTimestamp(envelope.Timestamp),
                Namespace: envelope.Namespace,
                Topic:     envelope.Topic,
                Event:     protobuf.FromAny(envelope.Event),
        }

        _, err := p.client.Forward(ctx, &api.ForwardRequest{Envelope: wire})
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Subscribe opens a subscription on the remote events service using the
// TTRPC client and relays the received envelopes on ch.
//
// If the subscription cannot be opened, the error is sent on errs, which is
// then closed. Otherwise a goroutine receives until the stream fails or ctx
// is done: a receive error is sent on errs as is, a context error is sent
// unless it is context.Canceled, and errs is closed afterwards. The event
// channel ch is never closed.
func (p *ttrpcEventsProxy) Subscribe(ctx context.Context, filters ...string) (ch <-chan *events.Envelope, errs <-chan error) {
        envelopes := make(chan *events.Envelope)
        failures := make(chan error, 1)

        stream, err := p.client.Subscribe(ctx, &api.SubscribeRequest{Filters: filters})
        if err != nil {
                failures <- err
                close(failures)
                return envelopes, failures
        }

        go func() {
                defer close(failures)

                for {
                        msg, recvErr := stream.Recv()
                        if recvErr != nil {
                                failures <- recvErr
                                return
                        }

                        env := &events.Envelope{
                                Timestamp: protobuf.FromTimestamp(msg.Timestamp),
                                Namespace: msg.Namespace,
                                Topic:     msg.Topic,
                                Event:     msg.Event,
                        }

                        select {
                        case envelopes <- env:
                        case <-ctx.Done():
                                // A plain cancellation is not reported.
                                if cause := ctx.Err(); cause != context.Canceled {
                                        failures <- cause
                                }
                                return
                        }
                }
        }()

        return envelopes, failures
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import (
        "archive/tar"
        "context"
        "encoding/json"
        "fmt"
        "io"
        "path"
        "sort"
        "strings"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
        digest "github.com/opencontainers/go-digest"
        ocispecs "github.com/opencontainers/image-spec/specs-go"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// exportOptions collects the settings applied by the ExportOpt functions.
type exportOptions struct {
        // manifests are the descriptors to export; WithImage and WithImages
        // append to it.
        manifests          []ocispec.Descriptor
        // platform is the matcher set by WithPlatform.
        platform           platforms.MatchComparer
        // allPlatforms is set by WithAllPlatforms.
        allPlatforms       bool
        // skipDockerManifest is set by WithSkipDockerManifest.
        skipDockerManifest bool
        // blobRecordOptions presumably tunes how blobs are written to the
        // archive; it is not set by any option visible here — TODO confirm.
        blobRecordOptions  blobRecordOptions
}

// ExportOpt defines options for configuring exported descriptors.
// An option mutates the given exportOptions and may fail with an error.
type ExportOpt func(context.Context, *exportOptions) error

// WithPlatform defines the platform that manifest lists are required to
// have when not all platforms are exported.
// Additionally, platform is used to resolve image configs for
// Docker v1.1, v1.2 format compatibility.
func WithPlatform(p platforms.MatchComparer) ExportOpt {
        return func(_ context.Context, opts *exportOptions) error {
                opts.platform = p
                return nil
        }
}

// WithAllPlatforms requests that every manifest of a manifest list be
// exported, not only those matching the configured platform.
// Missing content will fail the export.
func WithAllPlatforms() ExportOpt {
        enableAll := func(_ context.Context, opts *exportOptions) error {
                opts.allPlatforms = true
                return nil
        }
        return enableAll
}

// WithSkipDockerManifest turns off creation of the Docker compatible
// manifest.json file in the exported archive.
func WithSkipDockerManifest() ExportOpt {
        skip := func(_ context.Context, opts *exportOptions) error {
                opts.skipDockerManifest = true
                return nil
        }
        return skip
}

// WithImage looks up the named image in the given image store and adds its
// target descriptor, annotated with the image name, to the exported archive.
// A failed lookup is returned as the option's error.
func WithImage(is images.Store, name string) ExportOpt {
        addImage := func(ctx context.Context, opts *exportOptions) error {
                stored, err := is.Get(ctx, name)
                if err != nil {
                        return err
                }

                // Work on a copy of the target so the name annotations only
                // live on the exported descriptor.
                target := stored.Target
                target.Annotations = addNameAnnotation(name, target.Annotations)
                opts.manifests = append(opts.manifests, target)
                return nil
        }
        return addImage
}

// WithImages adds multiple images to the exported archive. Each image's
// target descriptor is annotated with the image's name.
func WithImages(imgs []images.Image) ExportOpt {
        return func(_ context.Context, opts *exportOptions) error {
                for i := range imgs {
                        // Copy the target; the caller's images are left untouched.
                        target := imgs[i].Target
                        target.Annotations = addNameAnnotation(imgs[i].Name, target.Annotations)
                        opts.manifests = append(opts.manifests, target)
                }
                return nil
        }
}

// WithManifest adds a manifest to the exported archive.
// When names are given, one copy of the manifest descriptor is added per
// name, each carrying that name's annotations, which creates an index record
// for every name.
// When no names are provided, the descriptor is added as is and it is up to
// the caller to put a name annotation on it if needed.
func WithManifest(manifest ocispec.Descriptor, names ...string) ExportOpt {
        return func(_ context.Context, opts *exportOptions) error {
                if len(names) == 0 {
                        opts.manifests = append(opts.manifests, manifest)
                        return nil
                }

                for _, n := range names {
                        named := manifest
                        named.Annotations = addNameAnnotation(n, manifest.Annotations)
                        opts.manifests = append(opts.manifests, named)
                }
                return nil
        }
}

// BlobFilter decides whether the blob described by the given descriptor is
// written to the archive: it returns false if the blob should not be included.
type BlobFilter func(ocispec.Descriptor) bool

// WithBlobFilter installs f as the BlobFilter consulted for every blob
// record created during the export.
func WithBlobFilter(f BlobFilter) ExportOpt {
        install := func(_ context.Context, opts *exportOptions) error {
                opts.blobRecordOptions.blobFilter = f
                return nil
        }
        return install
}

// WithSkipNonDistributableBlobs excludes non-distributable blobs such as
// Windows base layers. It is a blob filter keyed on the descriptor's media
// type.
func WithSkipNonDistributableBlobs() ExportOpt {
        return WithBlobFilter(func(desc ocispec.Descriptor) bool {
                distributable := !images.IsNonDistributable(desc.MediaType)
                return distributable
        })
}

// WithSkipMissing excludes blobs referenced by manifests if not all blobs
// would be included in the archive.
// The manifest itself is excluded only if it's not present locally.
// This allows exporting multi-platform images when not all platforms are
// present, while still persisting the multi-platform index.
func WithSkipMissing(store content.InfoReaderProvider) ExportOpt {
        skipMissing := func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                children, err := images.Children(ctx, store, desc)
                // Anything that is not a manifest is passed through untouched,
                // including any lookup error.
                if !images.IsManifestType(desc.MediaType) {
                        return children, err
                }

                switch {
                case err == nil:
                        // Manifest is readable; inspect its children below.
                case errdefs.IsNotFound(err):
                        // If manifest itself is missing, skip it from export.
                        return nil, images.ErrSkipDesc
                default:
                        return nil, err
                }

                // Don't export manifest descendants if any of them doesn't exist.
                for _, child := range children {
                        present, err := content.Exists(ctx, store, child)
                        if err != nil {
                                return nil, err
                        }
                        if !present {
                                // Only the manifest is exported, not its descendants.
                                return nil, nil
                        }
                }
                return children, nil
        }

        return func(_ context.Context, opts *exportOptions) error {
                opts.blobRecordOptions.childrenHandler = images.HandlerFunc(skipMissing)
                return nil
        }
}

// addNameAnnotation returns a new map holding every entry of base plus the
// image name annotation and the OCI reference name annotation derived from
// name. base is never modified.
func addNameAnnotation(name string, base map[string]string) map[string]string {
        merged := make(map[string]string, len(base)+2)
        for key, value := range base {
                merged[key] = value
        }

        merged[images.AnnotationImageName] = name
        merged[ocispec.AnnotationRefName] = ociReferenceName(name)
        return merged
}

// copySourceLabels returns desc with every content label whose key starts
// with labels.LabelDistributionSource copied into its annotations. A label
// replaces an existing annotation of the same key.
//
// The annotations are written to a fresh map: desc is passed by value but its
// Annotations map is shared with the caller, so writing into it directly
// would modify the caller's descriptor.
//
// If the content info cannot be read, desc is returned unchanged together
// with the error.
func copySourceLabels(ctx context.Context, infoProvider content.InfoProvider, desc ocispec.Descriptor) (ocispec.Descriptor, error) {
        info, err := infoProvider.Info(ctx, desc.Digest)
        if err != nil {
                return desc, err
        }

        var merged map[string]string
        for k, v := range info.Labels {
                if !strings.HasPrefix(k, labels.LabelDistributionSource) {
                        continue
                }
                if merged == nil {
                        // Copy lazily so descriptors without source labels keep
                        // their original map.
                        merged = make(map[string]string, len(desc.Annotations)+1)
                        for ak, av := range desc.Annotations {
                                merged[ak] = av
                        }
                }
                merged[k] = v
        }
        if merged != nil {
                desc.Annotations = merged
        }
        return desc, nil
}

// Export implements Exporter.
//
// It applies opts, collects tar records for every requested manifest or index
// together with the content they reference, and writes them to writer as a
// tar archive containing the OCI layout file, an index.json listing the
// requested descriptors and, unless disabled, a Docker compatible
// manifest.json.
//
// Descriptors whose distribution source labels cannot be read are logged and
// left out. A descriptor that is neither a manifest nor an index fails the
// export with errdefs.ErrInvalidArgument. When a platform is configured and
// an index has no candidate manifest for it, the export fails with
// errdefs.ErrNotFound.
func Export(ctx context.Context, store content.InfoReaderProvider, writer io.Writer, opts ...ExportOpt) error {
        var eo exportOptions
        for _, opt := range opts {
                if err := opt(ctx, &eo); err != nil {
                        return err
                }
        }

        records := []tarRecord{
                ociLayoutFile(""),
        }

        manifests := make([]ocispec.Descriptor, 0, len(eo.manifests))
        for _, desc := range eo.manifests {
                labeled, err := copySourceLabels(ctx, store, desc)
                if err != nil {
                        log.G(ctx).WithError(err).WithField("desc", desc).Warn("failed to copy distribution.source labels")
                        continue
                }
                manifests = append(manifests, labeled)
        }

        // algorithms collects the digest algorithms in use so the matching
        // blobs/<alg>/ directories can be created.
        algorithms := map[string]struct{}{}
        // dManifests maps a manifest digest to the entry written to manifest.json.
        dManifests := map[digest.Digest]*exportManifest{}
        // resolvedIndex maps an index digest to the manifest selected from it
        // ("" when none was selected), so each index is only processed once.
        resolvedIndex := map[digest.Digest]digest.Digest{}
        for _, desc := range manifests {
                switch {
                case images.IsManifestType(desc.MediaType):
                        mt, ok := dManifests[desc.Digest]
                        if !ok {
                                r, err := getRecords(ctx, store, desc, algorithms, &eo.blobRecordOptions)
                                if err != nil {
                                        return err
                                }
                                records = append(records, r...)

                                mt = &exportManifest{
                                        manifest: desc,
                                }
                                dManifests[desc.Digest] = mt
                        }

                        if name := desc.Annotations[images.AnnotationImageName]; name != "" {
                                mt.names = append(mt.names, name)
                        }
                case images.IsIndexType(desc.MediaType):
                        resolved, ok := resolvedIndex[desc.Digest]
                        if !ok {
                                if err := desc.Digest.Validate(); err != nil {
                                        return err
                                }
                                records = append(records, blobRecord(store, desc, &eo.blobRecordOptions))

                                p, err := content.ReadBlob(ctx, store, desc)
                                if err != nil {
                                        return err
                                }

                                var index ocispec.Index
                                if err := json.Unmarshal(p, &index); err != nil {
                                        return err
                                }

                                // matched holds the index entries that are candidates
                                // for the configured platform.
                                var matched []ocispec.Descriptor
                                for _, m := range index.Manifests {
                                        if eo.platform != nil {
                                                if m.Platform == nil || eo.platform.Match(*m.Platform) {
                                                        matched = append(matched, m)
                                                } else if !eo.allPlatforms {
                                                        continue
                                                }
                                        }

                                        r, err := getRecords(ctx, store, m, algorithms, &eo.blobRecordOptions)
                                        if err != nil {
                                                return err
                                        }

                                        records = append(records, r...)
                                }

                                if len(matched) >= 1 {
                                        if len(matched) > 1 {
                                                // Best platform match first; entries
                                                // without a platform sort last.
                                                sort.SliceStable(matched, func(i, j int) bool {
                                                        if matched[i].Platform == nil {
                                                                return false
                                                        }
                                                        if matched[j].Platform == nil {
                                                                return true
                                                        }
                                                        return eo.platform.Less(*matched[i].Platform, *matched[j].Platform)
                                                })
                                        }
                                        resolved = matched[0].Digest
                                        // Keep an entry that already exists: the same
                                        // manifest may have been exported directly, and
                                        // replacing it would drop the names recorded so far.
                                        if _, exists := dManifests[resolved]; !exists {
                                                dManifests[resolved] = &exportManifest{
                                                        manifest: matched[0],
                                                }
                                        }
                                } else if eo.platform != nil {
                                        return fmt.Errorf("no manifest found for platform: %w", errdefs.ErrNotFound)
                                }
                                resolvedIndex[desc.Digest] = resolved
                        }
                        if resolved != "" {
                                if name := desc.Annotations[images.AnnotationImageName]; name != "" {
                                        mt := dManifests[resolved]
                                        mt.names = append(mt.names, name)
                                }
                        }
                default:
                        return fmt.Errorf("only manifests may be exported: %w", errdefs.ErrInvalidArgument)
                }
        }

        records = append(records, ociIndexRecord(manifests))

        if !eo.skipDockerManifest && len(dManifests) > 0 {
                tr, err := manifestsRecord(ctx, store, dManifests)
                if err != nil {
                        return fmt.Errorf("unable to create manifests file: %w", err)
                }

                records = append(records, tr)
        }

        if len(algorithms) > 0 {
                records = append(records, directoryRecord("blobs/", 0755))
                for alg := range algorithms {
                        records = append(records, directoryRecord("blobs/"+alg+"/", 0755))
                }
        }

        tw := tar.NewWriter(writer)
        writeErr := writeTar(ctx, tw, records)
        // Close flushes the padding and writes the tar footer, so its error
        // decides whether the archive is complete. It is still called after a
        // failed write, but the write error takes precedence.
        closeErr := tw.Close()
        if writeErr != nil {
                return writeErr
        }
        return closeErr
}

// getRecords walks desc and its descendants and returns one blob record per
// visited descriptor. The digest algorithm of every visited descriptor is
// added to algorithms. Children are resolved with brOpts.childrenHandler when
// set, otherwise with the default children handler for store.
func getRecords(ctx context.Context, store content.Provider, desc ocispec.Descriptor, algorithms map[string]struct{}, brOpts *blobRecordOptions) ([]tarRecord, error) {
        var collected []tarRecord
        collect := images.HandlerFunc(func(_ context.Context, d ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                if err := d.Digest.Validate(); err != nil {
                        return nil, err
                }
                collected = append(collected, blobRecord(store, d, brOpts))
                algorithms[d.Digest.Algorithm().String()] = struct{}{}
                return nil, nil
        })

        walkChildren := brOpts.childrenHandler
        if walkChildren == nil {
                walkChildren = images.ChildrenHandler(store)
        }

        // Walk sequentially since the number of fetches is likely one and doing in
        // parallel requires locking the export handler
        if err := images.Walk(ctx, images.Handlers(walkChildren, collect), desc); err != nil {
                return nil, err
        }
        return collected, nil
}

// tarRecord describes a single entry of the exported tar archive.
type tarRecord struct {
        // Header is the tar header of the entry; records with a nil Header are
        // dropped before writing.
        Header *tar.Header
        // CopyTo writes the entry's content to the given writer and reports the
        // number of bytes written. It may be nil for entries without content.
        CopyTo func(context.Context, io.Writer) (int64, error)
}

// blobRecordOptions customizes how blob records are collected and created.
type blobRecordOptions struct {
        // blobFilter, when set, decides whether a blob gets a record at all.
        blobFilter      BlobFilter
        // childrenHandler, when set, replaces the default handler used to
        // resolve the children of a descriptor while walking.
        childrenHandler images.HandlerFunc
}

// blobRecord builds the tar record for the blob described by desc, stored
// under the blobs directory by digest algorithm and encoded digest. An empty
// record is returned when the configured blob filter rejects desc. The
// record's CopyTo streams the blob from cs and fails if the copied bytes do
// not hash to desc.Digest.
func blobRecord(cs content.Provider, desc ocispec.Descriptor, opts *blobRecordOptions) tarRecord {
        if opts != nil {
                if keep := opts.blobFilter; keep != nil && !keep(desc) {
                        return tarRecord{}
                }
        }

        copyBlob := func(ctx context.Context, w io.Writer) (int64, error) {
                ra, err := cs.ReaderAt(ctx, desc)
                if err != nil {
                        return 0, fmt.Errorf("failed to get reader: %w", err)
                }
                defer ra.Close()

                // Hash the content while it is copied so the digest can be verified.
                verifier := desc.Digest.Algorithm().Digester()
                sink := io.MultiWriter(w, verifier.Hash())

                copied, err := io.Copy(sink, content.NewReader(ra))
                if err != nil {
                        return 0, fmt.Errorf("failed to copy to tar: %w", err)
                }
                if actual := verifier.Digest(); actual != desc.Digest {
                        return 0, fmt.Errorf("unexpected digest %s copied", actual)
                }
                return copied, nil
        }

        dgst := desc.Digest
        return tarRecord{
                Header: &tar.Header{
                        Name:     path.Join(ocispec.ImageBlobsDir, dgst.Algorithm().String(), dgst.Encoded()),
                        Mode:     0444,
                        Size:     desc.Size,
                        Typeflag: tar.TypeReg,
                },
                CopyTo: copyBlob,
        }
}

// directoryRecord returns a content-less tar record for a directory entry
// with the given name and mode.
func directoryRecord(name string, mode int64) tarRecord {
        hdr := &tar.Header{
                Typeflag: tar.TypeDir,
                Name:     name,
                Mode:     mode,
        }
        return tarRecord{Header: hdr}
}

// ociLayoutFile returns the tar record for the OCI image layout file carrying
// the given layout version. An empty version selects
// ocispec.ImageLayoutVersion. It panics if the layout cannot be marshaled.
func ociLayoutFile(version string) tarRecord {
        layout := ocispec.ImageLayout{Version: version}
        if layout.Version == "" {
                layout.Version = ocispec.ImageLayoutVersion
        }

        payload, err := json.Marshal(layout)
        if err != nil {
                panic(err)
        }

        hdr := &tar.Header{
                Name:     ocispec.ImageLayoutFile,
                Mode:     0444,
                Size:     int64(len(payload)),
                Typeflag: tar.TypeReg,
        }
        writePayload := func(_ context.Context, w io.Writer) (int64, error) {
                written, err := w.Write(payload)
                return int64(written), err
        }
        return tarRecord{Header: hdr, CopyTo: writePayload}
}

// ociIndexRecord returns the tar record for the archive's OCI index file,
// a schema version 2 image index listing the given manifests. It panics if
// the index cannot be marshaled.
func ociIndexRecord(manifests []ocispec.Descriptor) tarRecord {
        payload, err := json.Marshal(ocispec.Index{
                Versioned: ocispecs.Versioned{SchemaVersion: 2},
                MediaType: ocispec.MediaTypeImageIndex,
                Manifests: manifests,
        })
        if err != nil {
                panic(err)
        }

        hdr := &tar.Header{
                Name:     ocispec.ImageIndexFile,
                Mode:     0644,
                Size:     int64(len(payload)),
                Typeflag: tar.TypeReg,
        }
        writePayload := func(_ context.Context, w io.Writer) (int64, error) {
                written, err := w.Write(payload)
                return int64(written), err
        }
        return tarRecord{Header: hdr, CopyTo: writePayload}
}

// exportManifest pairs a manifest descriptor with the image names under which
// it is listed in the Docker compatible manifest.json.
type exportManifest struct {
        // manifest is the descriptor whose blob is read to build the entry.
        manifest ocispec.Descriptor
        // names are the image names turned into the entry's RepoTags.
        names    []string
}

// manifestsRecord builds the tar record for the Docker compatible
// manifest.json file. Every manifest is read from store and contributes one
// entry holding the blob paths of its config and layers and, as RepoTags, the
// familiarized form of its names.
//
// Entries are emitted in ascending order of manifest digest so that the same
// input always produces the same file. The config digest and every layer
// digest are validated before a blob path is derived from them.
func manifestsRecord(ctx context.Context, store content.Provider, manifests map[digest.Digest]*exportManifest) (tarRecord, error) {
        // Map iteration order is not defined; sort the keys to make the output
        // reproducible.
        dgsts := make([]digest.Digest, 0, len(manifests))
        for dgst := range manifests {
                dgsts = append(dgsts, dgst)
        }
        sort.Slice(dgsts, func(i, j int) bool {
                return dgsts[i] < dgsts[j]
        })

        mfsts := make([]struct {
                Config   string
                RepoTags []string
                Layers   []string
        }, len(manifests))

        for i, key := range dgsts {
                m := manifests[key]

                p, err := content.ReadBlob(ctx, store, m.manifest)
                if err != nil {
                        return tarRecord{}, err
                }

                var manifest ocispec.Manifest
                if err := json.Unmarshal(p, &manifest); err != nil {
                        return tarRecord{}, err
                }

                dgst := manifest.Config.Digest
                if err := dgst.Validate(); err != nil {
                        return tarRecord{}, err
                }
                mfsts[i].Config = path.Join(ocispec.ImageBlobsDir, dgst.Algorithm().String(), dgst.Encoded())

                for _, l := range manifest.Layers {
                        // The layers may not have been walked (and so validated)
                        // before; check here instead of risking a malformed digest
                        // being turned into a path.
                        if err := l.Digest.Validate(); err != nil {
                                return tarRecord{}, err
                        }
                        mfsts[i].Layers = append(mfsts[i].Layers, path.Join(ocispec.ImageBlobsDir, l.Digest.Algorithm().String(), l.Digest.Encoded()))
                }

                for _, name := range m.names {
                        nname, err := familiarizeReference(name)
                        if err != nil {
                                return tarRecord{}, err
                        }

                        mfsts[i].RepoTags = append(mfsts[i].RepoTags, nname)
                }
        }

        b, err := json.Marshal(mfsts)
        if err != nil {
                return tarRecord{}, err
        }

        return tarRecord{
                Header: &tar.Header{
                        Name:     "manifest.json",
                        Mode:     0644,
                        Size:     int64(len(b)),
                        Typeflag: tar.TypeReg,
                },
                CopyTo: func(ctx context.Context, w io.Writer) (int64, error) {
                        n, err := w.Write(b)
                        return int64(n), err
                },
        }, nil
}

// writeTar writes the given records to tw ordered by header name. Records
// without a header are ignored and, of several records sharing a name, only
// one is written. It fails when a record's content does not match the size
// announced in its header, or when a record announces a size but has no
// content to write.
func writeTar(ctx context.Context, tw *tar.Writer, recordsWithEmpty []tarRecord) error {
        valid := make([]tarRecord, 0, len(recordsWithEmpty))
        for _, rec := range recordsWithEmpty {
                if rec.Header == nil {
                        continue
                }
                valid = append(valid, rec)
        }
        sort.Slice(valid, func(a, b int) bool {
                return valid[a].Header.Name < valid[b].Header.Name
        })

        var prev string
        for _, rec := range valid {
                hdr := rec.Header
                // Equal names are adjacent after sorting; skip the repeats.
                if hdr.Name == prev {
                        continue
                }
                prev = hdr.Name

                if err := tw.WriteHeader(hdr); err != nil {
                        return err
                }

                switch {
                case rec.CopyTo != nil:
                        written, err := rec.CopyTo(ctx, tw)
                        if err != nil {
                                return err
                        }
                        if written != hdr.Size {
                                return fmt.Errorf("unexpected copy size for %s", hdr.Name)
                        }
                case hdr.Size > 0:
                        return fmt.Errorf("no content to write to record with non-zero size for %s", hdr.Name)
                }
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package archive provides a Docker and OCI compatible importer
package archive

import (
        "archive/tar"
        "bytes"
        "context"
        "encoding/json"
        "errors"
        "fmt"
        "io"
        "path"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/archive/compression"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
        digest "github.com/opencontainers/go-digest"
        specs "github.com/opencontainers/image-spec/specs-go"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// importOpts holds the settings applied by ImportOpt functions.
type importOpts struct {
        // compress is forwarded to layer resolution when importing Docker
        // format archives.
        compress bool
}

// ImportOpt is a functional option for importing an OCI index. It may return
// an error to abort the import.
type ImportOpt func(*importOpts) error

// WithImportCompression requests compression of uncompressed layers on
// import. This is used for import formats which do not include the manifest.
func WithImportCompression() ImportOpt {
        enable := func(opts *importOpts) error {
                opts.compress = true
                return nil
        }
        return enable
}

// ImportIndex imports an index from a tar archive image bundle
//   - implements Docker v1.1, v1.2 and OCI v1.
//   - prefers OCI v1 when provided
//   - creates OCI index for Docker formats
//   - normalizes Docker references and adds as OCI ref name
//     e.g. alpine:latest -> docker.io/library/alpine:latest
//   - existing OCI reference names are untouched
//
// Every regular file in the archive other than the OCI layout file and
// manifest.json is ingested into store as a blob. The returned descriptor
// refers to the index: the archive's own index.json for OCI layouts, or an
// index written by this function for Docker formats.
func ImportIndex(ctx context.Context, store content.Store, reader io.Reader, opts ...ImportOpt) (ocispec.Descriptor, error) {
        var (
                tr = tar.NewReader(reader)

                ociLayout ocispec.ImageLayout
                // mfsts receives the decoded content of manifest.json.
                mfsts     []struct {
                        Config   string
                        RepoTags []string
                        Layers   []string
                }
                // symlinks maps a symlink's name to its target, resolved relative
                // to the directory of the link.
                symlinks = make(map[string]string)
                // blobs maps the cleaned archive path of every ingested file to
                // its descriptor.
                blobs    = make(map[string]ocispec.Descriptor)
                iopts    importOpts
        )

        for _, o := range opts {
                if err := o(&iopts); err != nil {
                        return ocispec.Descriptor{}, err
                }
        }

        // First pass: read the whole archive, decoding the two well-known JSON
        // files and ingesting everything else.
        for {
                hdr, err := tr.Next()
                if err == io.EOF {
                        break
                }
                if err != nil {
                        return ocispec.Descriptor{}, err
                }
                if hdr.Typeflag == tar.TypeSymlink {
                        symlinks[hdr.Name] = path.Join(path.Dir(hdr.Name), hdr.Linkname)
                }

                //nolint:staticcheck // TypeRegA is deprecated but we may still receive an external tar with TypeRegA
                if hdr.Typeflag != tar.TypeReg && hdr.Typeflag != tar.TypeRegA {
                        if hdr.Typeflag != tar.TypeDir {
                                log.G(ctx).WithField("file", hdr.Name).Debug("file type ignored")
                        }
                        continue
                }

                hdrName := path.Clean(hdr.Name)
                if hdrName == ocispec.ImageLayoutFile {
                        if err = onUntarJSON(tr, &ociLayout); err != nil {
                                return ocispec.Descriptor{}, fmt.Errorf("untar oci layout %q: %w", hdr.Name, err)
                        }
                } else if hdrName == "manifest.json" {
                        if err = onUntarJSON(tr, &mfsts); err != nil {
                                return ocispec.Descriptor{}, fmt.Errorf("untar manifest %q: %w", hdr.Name, err)
                        }
                } else {
                        // Any other regular file is stored as a blob; its digest is
                        // computed while it is written.
                        dgst, err := onUntarBlob(ctx, tr, store, hdr.Size, "tar-"+hdrName)
                        if err != nil {
                                return ocispec.Descriptor{}, fmt.Errorf("failed to ingest %q: %w", hdr.Name, err)
                        }

                        blobs[hdrName] = ocispec.Descriptor{
                                Digest: dgst,
                                Size:   hdr.Size,
                        }
                }
        }

        // If OCI layout was given, interpret the tar as an OCI layout.
        // When not provided, the layout of the tar will be interpreted
        // as Docker v1.1 or v1.2.
        if ociLayout.Version != "" {
                if ociLayout.Version != ocispec.ImageLayoutVersion {
                        return ocispec.Descriptor{}, fmt.Errorf("unsupported OCI version %s", ociLayout.Version)
                }

                // index.json was ingested as an ordinary blob above; return its
                // descriptor with the index media type filled in.
                idx, ok := blobs[ocispec.ImageIndexFile]
                if !ok {
                        return ocispec.Descriptor{}, fmt.Errorf("missing index.json in OCI layout %s", ocispec.ImageLayoutVersion)
                }

                idx.MediaType = ocispec.MediaTypeImageIndex
                return idx, nil
        }

        // Without an OCI layout file, manifest.json is required.
        if mfsts == nil {
                return ocispec.Descriptor{}, errors.New("unrecognized image format")
        }

        // Make the blob behind each symlink reachable under the link's own name.
        for name, linkname := range symlinks {
                desc, ok := blobs[linkname]
                if !ok {
                        return ocispec.Descriptor{}, fmt.Errorf("no target for symlink layer from %q to %q", name, linkname)
                }
                blobs[name] = desc
        }

        idx := ocispec.Index{
                Versioned: specs.Versioned{
                        SchemaVersion: 2,
                },
        }
        // Build and store one Docker schema 2 manifest per manifest.json entry.
        for _, mfst := range mfsts {
                config, ok := blobs[mfst.Config]
                if !ok {
                        return ocispec.Descriptor{}, fmt.Errorf("image config %q not found", mfst.Config)
                }
                config.MediaType = images.MediaTypeDockerSchema2Config

                layers, err := resolveLayers(ctx, store, mfst.Layers, blobs, iopts.compress)
                if err != nil {
                        return ocispec.Descriptor{}, fmt.Errorf("failed to resolve layers: %w", err)
                }

                manifest := struct {
                        SchemaVersion int                  `json:"schemaVersion"`
                        MediaType     string               `json:"mediaType"`
                        Config        ocispec.Descriptor   `json:"config"`
                        Layers        []ocispec.Descriptor `json:"layers"`
                }{
                        SchemaVersion: 2,
                        MediaType:     images.MediaTypeDockerSchema2Manifest,
                        Config:        config,
                        Layers:        layers,
                }

                desc, err := writeManifest(ctx, store, manifest, manifest.MediaType)
                if err != nil {
                        return ocispec.Descriptor{}, fmt.Errorf("write docker manifest: %w", err)
                }

                imgPlatforms, err := images.Platforms(ctx, store, desc)
                if err != nil {
                        return ocispec.Descriptor{}, fmt.Errorf("unable to resolve platform: %w", err)
                }
                if len(imgPlatforms) > 0 {
                        // Only one platform can be resolved from non-index manifest,
                        // The platform can only come from the config included above,
                        // if the config has no platform it can be safely omitted.
                        desc.Platform = &imgPlatforms[0]

                        // If the image we've just imported is a Windows image without the OSVersion set,
                        // we could just assume it matches this host's OS Version. Without this, the
                        // children labels might not be set on the image content, leading to it being
                        // garbage collected, breaking the image.
                        // See: https://github.com/containerd/containerd/issues/5690
                        if desc.Platform.OS == "windows" && desc.Platform.OSVersion == "" {
                                platform := platforms.DefaultSpec()
                                desc.Platform.OSVersion = platform.OSVersion
                        }
                }

                if len(mfst.RepoTags) == 0 {
                        // Untagged image: list the manifest once, without name annotations.
                        idx.Manifests = append(idx.Manifests, desc)
                } else {
                        // Add descriptor per tag
                        for _, ref := range mfst.RepoTags {
                                mfstdesc := desc

                                normalized, err := normalizeReference(ref)
                                if err != nil {
                                        return ocispec.Descriptor{}, err
                                }

                                mfstdesc.Annotations = map[string]string{
                                        images.AnnotationImageName: normalized,
                                        ocispec.AnnotationRefName:  ociReferenceName(normalized),
                                }

                                idx.Manifests = append(idx.Manifests, mfstdesc)
                        }
                }
        }

        return writeManifest(ctx, store, idx, ocispec.MediaTypeImageIndex)
}

// Byte-size units and the upper bound on the number of bytes read when
// decoding a JSON file from an archive.
const (
        kib = 1 << 10

        mib = kib << 10

        jsonLimit = 20 * mib
)

// onUntarJSON decodes a single JSON value from r into j, reading at most
// jsonLimit bytes from r.
func onUntarJSON(r io.Reader, j interface{}) error {
        limited := io.LimitReader(r, jsonLimit)
        dec := json.NewDecoder(limited)
        return dec.Decode(j)
}

// onUntarBlob writes the content of r to store under the ingest reference
// ref, expecting size bytes, and returns the canonical digest computed over
// the bytes read while writing.
func onUntarBlob(ctx context.Context, r io.Reader, store content.Ingester, size int64, ref string) (digest.Digest, error) {
        digester := digest.Canonical.Digester()
        // Everything the writer reads from r is hashed as a side effect.
        tee := io.TeeReader(r, digester.Hash())
        expected := ocispec.Descriptor{Size: size}

        if err := content.WriteBlob(ctx, store, ref, tee, expected); err != nil {
                return "", err
        }
        return digester.Digest(), nil
}

// resolveLayers returns one descriptor per entry of layerFiles, in the same
// order, with the media type of every layer filled in.
//
// Each file name is looked up in blobs; a missing entry is an error. The
// content store is then walked, filtered on labels.LabelUncompressed matching
// one of the layer digests. When such a blob is found, the layer descriptor is
// redirected to it (digest and size) and its media type is detected from the
// blob header.
//
// Any layer that still has no media type is opened and probed with
// compression.DecompressStream:
//   - a compressed stream is recorded as a gzip layer;
//   - an uncompressed stream is recorded as a plain layer, unless compress is
//     set, in which case it is gzip-compressed into the store (labeled with
//     the uncompressed digest) and the layer is replaced by the new blob.
//
// The decompression stream and the reader are closed on every path out of the
// per-layer loop, including the digest validation failure.
func resolveLayers(ctx context.Context, store content.Store, layerFiles []string, blobs map[string]ocispec.Descriptor, compress bool) ([]ocispec.Descriptor, error) {
        layers := make([]ocispec.Descriptor, len(layerFiles))
        // descs indexes the entries of layers by their (uncompressed) digest so
        // the store walk below can update them in place.
        descs := map[digest.Digest]*ocispec.Descriptor{}
        filters := []string{}
        for i, f := range layerFiles {
                desc, ok := blobs[f]
                if !ok {
                        return nil, fmt.Errorf("layer %q not found", f)
                }
                layers[i] = desc
                descs[desc.Digest] = &layers[i]
                filters = append(filters, fmt.Sprintf("labels.\"%s\"==%s", labels.LabelUncompressed, desc.Digest.String()))
        }

        err := store.Walk(ctx, func(info content.Info) error {
                dgst, ok := info.Labels[labels.LabelUncompressed]
                if ok {
                        desc := descs[digest.Digest(dgst)]
                        if desc != nil {
                                // Point the layer at the already stored blob
                                // carrying the uncompressed-digest label.
                                desc.Digest = info.Digest
                                desc.Size = info.Size
                                mediaType, err := detectLayerMediaType(ctx, store, *desc)
                                if err != nil {
                                        return fmt.Errorf("failed to detect media type of layer: %w", err)
                                }
                                desc.MediaType = mediaType
                        }
                }
                return nil
        }, filters...)
        if err != nil {
                return nil, fmt.Errorf("failure checking for compressed blobs: %w", err)
        }

        for i, desc := range layers {
                if desc.MediaType != "" {
                        // Already resolved by the store walk above.
                        continue
                }
                // Open blob, resolve media type
                ra, err := store.ReaderAt(ctx, desc)
                if err != nil {
                        return nil, fmt.Errorf("failed to open %q (%s): %w", layerFiles[i], desc.Digest, err)
                }
                s, err := compression.DecompressStream(content.NewReader(ra))
                if err != nil {
                        ra.Close()
                        return nil, fmt.Errorf("failed to detect compression for %q: %w", layerFiles[i], err)
                }
                if s.GetCompression() == compression.Uncompressed {
                        if compress {
                                if err := desc.Digest.Validate(); err != nil {
                                        // Release the stream and the reader on
                                        // this error path too.
                                        s.Close()
                                        ra.Close()
                                        return nil, err
                                }
                                ref := fmt.Sprintf("compress-blob-%s-%s", desc.Digest.Algorithm().String(), desc.Digest.Encoded())
                                blobLabels := map[string]string{
                                        labels.LabelUncompressed: desc.Digest.String(),
                                }
                                layers[i], err = compressBlob(ctx, store, s, ref, content.WithLabels(blobLabels))
                                if err != nil {
                                        s.Close()
                                        ra.Close()
                                        return nil, err
                                }
                                layers[i].MediaType = images.MediaTypeDockerSchema2LayerGzip
                        } else {
                                layers[i].MediaType = images.MediaTypeDockerSchema2Layer
                        }
                } else {
                        layers[i].MediaType = images.MediaTypeDockerSchema2LayerGzip
                }
                s.Close()
                ra.Close()
        }
        return layers, nil
}

// compressBlob gzip-compresses everything read from r into a new blob in cs,
// written under ref, and returns a descriptor carrying the digest and size of
// the compressed blob. opts are forwarded to the writer's Commit.
//
// The writer is always closed on return; if the function returns an error the
// ingest identified by ref is also aborted. A Commit that reports
// "already exists" is treated as success.
func compressBlob(ctx context.Context, cs content.Store, r io.Reader, ref string, opts ...content.Opt) (desc ocispec.Descriptor, err error) {
        w, err := content.OpenWriter(ctx, cs, content.WithRef(ref))
        if err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("failed to open writer: %w", err)
        }
        // err is the named result, so the cleanup sees whatever is returned.
        defer func() {
                w.Close()
                if err != nil {
                        cs.Abort(ctx, ref)
                }
        }()

        if err := w.Truncate(0); err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("failed to truncate writer: %w", err)
        }

        gz, err := compression.CompressStream(w, compression.Gzip)
        if err != nil {
                return ocispec.Descriptor{}, err
        }
        if _, err := io.Copy(gz, r); err != nil {
                return ocispec.Descriptor{}, err
        }
        // Close the compressor before reading the writer's status and digest.
        if err := gz.Close(); err != nil {
                return ocispec.Descriptor{}, err
        }

        status, err := w.Status()
        if err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("failed to get writer status: %w", err)
        }
        desc.Digest = w.Digest()
        desc.Size = status.Offset

        if err := w.Commit(ctx, desc.Size, desc.Digest, opts...); err != nil && !errdefs.IsAlreadyExists(err) {
                return ocispec.Descriptor{}, fmt.Errorf("failed to commit: %w", err)
        }

        return desc, nil
}

// writeManifest marshals manifest to JSON, writes the bytes to cs with
// content.WriteBlob under the ref "manifest-<digest>", and returns a
// descriptor with the given media type plus the digest and size of the
// encoded bytes.
func writeManifest(ctx context.Context, cs content.Ingester, manifest interface{}, mediaType string) (ocispec.Descriptor, error) {
        payload, err := json.Marshal(manifest)
        if err != nil {
                return ocispec.Descriptor{}, err
        }

        var desc ocispec.Descriptor
        desc.MediaType = mediaType
        desc.Digest = digest.FromBytes(payload)
        desc.Size = int64(len(payload))

        ref := "manifest-" + desc.Digest.String()
        if err := content.WriteBlob(ctx, cs, ref, bytes.NewReader(payload), desc); err != nil {
                return ocispec.Descriptor{}, err
        }

        return desc, nil
}

// detectLayerMediaType reads up to the first 10 bytes of the blob described
// by desc and picks a Docker schema 2 layer media type from them: the plain
// layer type when compression.DetectCompression reports Uncompressed, the
// gzip layer type for anything else.
//
// A read that ends with io.EOF yields the plain layer type without looking at
// the bytes.
func detectLayerMediaType(ctx context.Context, store content.Store, desc ocispec.Descriptor) (string, error) {
        ra, err := store.ReaderAt(ctx, desc)
        if err != nil {
                return "", fmt.Errorf("failed to read content store to detect layer media type: %w", err)
        }
        defer ra.Close()

        // need to parse existing blob to use the proper media type
        header := make([]byte, 10)
        _, err = ra.ReadAt(header, 0)
        switch {
        case err == io.EOF:
                // in the case of an empty layer then the media type should be uncompressed
                return images.MediaTypeDockerSchema2Layer, nil
        case err != nil:
                return "", fmt.Errorf("failed to read header bytes from layer to detect media type: %w", err)
        }

        if compression.DetectCompression(header) == compression.Uncompressed {
                return images.MediaTypeDockerSchema2Layer, nil
        }
        return images.MediaTypeDockerSchema2LayerGzip, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import (
        "fmt"
        "strings"

        "github.com/containerd/containerd/v2/pkg/reference"
        distref "github.com/distribution/reference"
        "github.com/opencontainers/go-digest"
)

// FilterRefPrefix restricts references to having the given image
// prefix. Tag-only references will have the prefix prepended.
//
// The returned function maps a full reference that does not carry the image
// prefix to the empty string, and returns the empty string for every input
// when image itself is empty.
func FilterRefPrefix(image string) func(string) string {
        return refTranslator(image, true)
}

// AddRefPrefix prepends the given image prefix to tag-only references,
// while returning full references unmodified.
//
// The returned function yields the empty string for every input when image
// is empty.
func AddRefPrefix(image string) func(string) string {
        return refTranslator(image, false)
}

// refTranslator builds a function that turns a tag-only reference into
// "image:tag" and passes full references through.
//
// A reference counts as full when it contains any of '/', ':' or '@'. With
// checkPrefix set, a full reference that is not prefixed by image is mapped to
// the empty string. An empty image maps every reference to the empty string.
func refTranslator(image string, checkPrefix bool) func(string) string {
        return func(ref string) string {
                switch {
                case image == "":
                        return ""
                case !strings.ContainsAny(ref, "/:@"):
                        // Tag-only reference: qualify it with the image name.
                        return image + ":" + ref
                case checkPrefix && !isImagePrefix(ref, image):
                        // If not prefixed, don't include image
                        return ""
                default:
                        return ref
                }
        }
}

// isImagePrefix reports whether s is exactly prefix, or starts with prefix
// immediately followed by one of '/', ':' or '@'. Requiring a separator
// prevents matching partial namespaces.
func isImagePrefix(s, prefix string) bool {
        if !strings.HasPrefix(s, prefix) {
                return false
        }
        if len(s) == len(prefix) {
                return true
        }
        next := s[len(prefix)]
        return next == '/' || next == ':' || next == '@'
}

// normalizeReference parses ref with distref.ParseDockerRef and returns the
// String form of the parsed reference. A parse failure is wrapped with the
// offending reference.
func normalizeReference(ref string) (string, error) {
        parsed, err := distref.ParseDockerRef(ref)
        if err == nil {
                return parsed.String(), nil
        }

        return "", fmt.Errorf("normalize image ref %q: %w", ref, err)
}

// familiarizeReference parses ref with distref.ParseNormalizedNamed, applies
// distref.TagNameOnly to the result, and returns its distref.FamiliarString
// form. A parse failure is wrapped with the offending reference.
func familiarizeReference(ref string) (string, error) {
        parsed, err := distref.ParseNormalizedNamed(ref)
        if err != nil {
                return "", fmt.Errorf("failed to parse %q: %w", ref, err)
        }

        return distref.FamiliarString(distref.TagNameOnly(parsed)), nil
}

// ociReferenceName returns the Object part of name as parsed by
// reference.Parse, or name unchanged when it cannot be parsed.
//
// OCI defines the reference name as only a tag excluding the repository. The
// containerd annotation contains the full image name since the tag is
// insufficient for correctly naming and referring to an image.
func ociReferenceName(name string) string {
        spec, err := reference.Parse(name)
        if err != nil {
                return name
        }
        return spec.Object
}

// DigestTranslator returns a function that builds a digest reference of the
// form "<prefix>@<digest>" for the digest it is given.
func DigestTranslator(prefix string) func(digest.Digest) string {
        base := prefix + "@"
        return func(dgst digest.Digest) string {
                return base + dgst.String()
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "io"

        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/pkg/archive/compression"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/log"
)

// GetDiffID gets the diff ID of the layer blob descriptor.
//
// For the uncompressed layer media types the descriptor digest is returned
// directly. Otherwise the value of the labels.LabelUncompressed label on the
// blob is used when present. Failing that, the blob is decompressed and
// hashed, and the result is stored back in that label on a best-effort basis:
// a failed label update is only logged.
func GetDiffID(ctx context.Context, cs content.Store, desc ocispec.Descriptor) (digest.Digest, error) {
        switch desc.MediaType {
        case
                // If the layer is already uncompressed, we can just return its digest
                MediaTypeDockerSchema2Layer,
                ocispec.MediaTypeImageLayer,
                MediaTypeDockerSchema2LayerForeign,
                ocispec.MediaTypeImageLayerNonDistributable: //nolint:staticcheck // deprecated
                return desc.Digest, nil
        }

        info, err := cs.Info(ctx, desc.Digest)
        if err != nil {
                return "", err
        }
        if cached, ok := info.Labels[labels.LabelUncompressed]; ok {
                // Fast path: if the image is already unpacked, we can use the label value
                return digest.Parse(cached)
        }

        // if the image is not unpacked, we may not have the label
        ra, err := cs.ReaderAt(ctx, desc)
        if err != nil {
                return "", err
        }
        defer ra.Close()

        uncompressed, err := compression.DecompressStream(content.NewReader(ra))
        if err != nil {
                return "", err
        }
        defer uncompressed.Close()

        digester := digest.Canonical.Digester()
        if _, err := io.Copy(digester.Hash(), uncompressed); err != nil {
                return "", err
        }
        // The reader is closed explicitly before the label update; the deferred
        // Close above still runs when the function returns.
        if err := ra.Close(); err != nil {
                return "", err
        }

        diffID := digester.Digest()
        // memorize the computed value
        if info.Labels == nil {
                info.Labels = make(map[string]string)
        }
        info.Labels[labels.LabelUncompressed] = diffID.String()
        if _, err := cs.Update(ctx, info, "labels"); err != nil {
                log.G(ctx).WithError(err).Warnf("failed to set %s label for %s", labels.LabelUncompressed, desc.Digest)
        }

        return diffID, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "errors"
        "fmt"
        "sort"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "golang.org/x/sync/errgroup"
        "golang.org/x/sync/semaphore"
)

// Sentinel errors understood by the handler helpers in this file. Callers are
// expected to compare against them with errors.Is.
var (
        // ErrSkipDesc is used to skip processing of a descriptor and
        // its descendants. Walk and Dispatch treat it as "do not traverse
        // the children" rather than as a failure.
        ErrSkipDesc = errors.New("skip descriptor")

        // ErrStopHandler is used to signify that the descriptor
        // has been handled and should not be handled further.
        // This applies only to a single descriptor in a handler
        // chain and does not apply to descendant descriptors.
        // Handlers stops calling the remaining handlers when it sees it.
        ErrStopHandler = errors.New("stop handler")

        // ErrEmptyWalk is returned by WalkNotEmpty when the handlers return no
        // children (e.g.: they were filtered out).
        ErrEmptyWalk = errors.New("image might be filtered out")
)

// Handler handles image manifests
type Handler interface {
        // Handle processes desc and returns the descriptors to visit next
        // (subdescs). Walk and Dispatch recurse into the returned descriptors.
        Handle(ctx context.Context, desc ocispec.Descriptor) (subdescs []ocispec.Descriptor, err error)
}

// HandlerFunc is a function type implementing the Handler interface, so a
// plain function can be used wherever a Handler is expected.
type HandlerFunc func(ctx context.Context, desc ocispec.Descriptor) (subdescs []ocispec.Descriptor, err error)

// Handle implements Handler by calling fn with the same arguments and
// returning its results unchanged.
func (fn HandlerFunc) Handle(ctx context.Context, desc ocispec.Descriptor) (subdescs []ocispec.Descriptor, err error) {
        return fn(ctx, desc)
}

// Handlers returns a handler that will run the handlers in sequence and
// return the concatenation of the children they report.
//
// A handler may return `ErrStopHandler` to stop calling additional handlers;
// the children collected from the handlers that ran before it are still
// returned. Any other error aborts the chain and is returned as is.
func Handlers(handlers ...Handler) HandlerFunc {
        return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                var collected []ocispec.Descriptor
                for i := range handlers {
                        found, herr := handlers[i].Handle(ctx, desc)
                        if herr == nil {
                                collected = append(collected, found...)
                                continue
                        }
                        if errors.Is(herr, ErrStopHandler) {
                                break
                        }
                        return nil, herr
                }

                return collected, nil
        }
}

// Walk the resources of an image and call the handler for each. If the handler
// returns sub-resources for a descriptor, they are walked recursively before
// moving on to the next descriptor.
//
// A handler may return `ErrSkipDesc` to skip the children of a descriptor; any
// other error stops the walk and is returned.
//
// This differs from dispatch in that each sibling resource is considered
// synchronously.
func Walk(ctx context.Context, handler Handler, descs ...ocispec.Descriptor) error {
        for i := range descs {
                children, err := handler.Handle(ctx, descs[i])
                switch {
                case err == nil:
                        // fall through to the children below
                case errors.Is(err, ErrSkipDesc):
                        continue // don't traverse the children.
                default:
                        return err
                }

                if len(children) == 0 {
                        continue
                }
                if err := Walk(ctx, handler, children...); err != nil {
                        return err
                }
        }
        return nil
}

// WalkNotEmpty works the same way Walk does, with the exception that it ensures that
// some children are still found by Walking the descriptors (for example, not all of
// them have been filtered out by one of the handlers). If there are no children,
// then an ErrEmptyWalk error is returned.
//
// Only successful handler calls count: children returned alongside an error
// are not taken into account.
func WalkNotEmpty(ctx context.Context, handler Handler, descs ...ocispec.Descriptor) error {
        sawChildren := false
        tracking := HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                children, err := handler.Handle(ctx, desc)
                if err == nil && len(children) > 0 {
                        sawChildren = true
                }
                return children, err
        })

        if err := Walk(ctx, tracking, descs...); err != nil {
                return err
        }
        if !sawChildren {
                return ErrEmptyWalk
        }

        return nil
}

// Dispatch runs the provided handler for content specified by the descriptors.
// If the handler decodes subresources, they will be visited, as well.
//
// Handlers for siblings are run in parallel on the provided descriptors. A
// handler may return `ErrSkipDesc` to signal to the dispatcher to not traverse
// any children.
//
// A concurrency limiter can be passed in to limit the number of concurrent
// handlers running. When limiter is nil, there is no limit.
//
// Typically, this function will be used with `FetchHandler`, often composed
// with other handlers.
//
// If any handler returns an error, the dispatch session will be canceled.
func Dispatch(ctx context.Context, handler Handler, limiter *semaphore.Weighted, descs ...ocispec.Descriptor) error {
        // ctx2 is the errgroup's derived context; it is the one handed to the
        // handlers and to the recursive Dispatch calls.
        eg, ctx2 := errgroup.WithContext(ctx)
        for _, desc := range descs {
                desc := desc

                if limiter != nil {
                        // The slot is acquired with the caller's ctx, before the
                        // goroutine is started.
                        // NOTE(review): on failure this returns without calling
                        // eg.Wait(), so goroutines already started are not
                        // waited for — confirm this is intended.
                        if err := limiter.Acquire(ctx, 1); err != nil {
                                return err
                        }
                }

                eg.Go(func() error {
                        desc := desc

                        children, err := handler.Handle(ctx2, desc)
                        // Release the slot as soon as the handler returns, before
                        // recursing into the children, so the recursive call can
                        // acquire slots of its own.
                        if limiter != nil {
                                limiter.Release(1)
                        }
                        if err != nil {
                                if errors.Is(err, ErrSkipDesc) {
                                        return nil // don't traverse the children.
                                }
                                return err
                        }

                        if len(children) > 0 {
                                return Dispatch(ctx2, handler, limiter, children...)
                        }

                        return nil
                })
        }

        return eg.Wait()
}

// ChildrenHandler decodes well-known manifest types and returns their children.
//
// This is useful for supporting recursive fetch and other use cases where you
// want to do a full walk of resources.
//
// One can also replace this with another implementation to allow descending of
// arbitrary types.
//
// The returned handler simply forwards each descriptor to Children together
// with the given provider.
func ChildrenHandler(provider content.Provider) HandlerFunc {
        return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                return Children(ctx, provider, desc)
        }
}

// SetChildrenLabels is a handler wrapper which sets labels for the content on
// the children returned by the handler and passes through the children.
// Must follow a handler that returns the children to be labeled.
//
// It is SetChildrenMappedLabels with a nil label map, which selects the
// default ChildGCLabels mapping.
func SetChildrenLabels(manager content.Manager, f HandlerFunc) HandlerFunc {
        return SetChildrenMappedLabels(manager, f, nil)
}

// SetChildrenMappedLabels is a handler wrapper which sets labels for the content on
// the children returned by the handler and passes through the children.
// Must follow a handler that returns the children to be labeled.
// The label map allows the caller to control the labels per child descriptor.
// For returned labels, the index of the child will be appended to the end
// except for the first index when the returned label does not end with '.'.
//
// A nil labelMap selects ChildGCLabels. The labels are written on the parent
// descriptor's content (desc.Digest) with a single manager.Update call, each
// label holding the digest of the corresponding child. When f fails, its
// result is returned untouched; when the update fails, nil children and the
// update error are returned.
//
// An empty key returned by labelMap no longer panics; it is treated like any
// other key that does not end with '.'.
func SetChildrenMappedLabels(manager content.Manager, f HandlerFunc, labelMap func(ocispec.Descriptor) []string) HandlerFunc {
        if labelMap == nil {
                labelMap = ChildGCLabels
        }
        return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                children, err := f(ctx, desc)
                if err != nil {
                        return children, err
                }
                if len(children) == 0 {
                        // Nothing to label.
                        return children, nil
                }

                var (
                        info = content.Info{
                                Digest: desc.Digest,
                                Labels: map[string]string{},
                        }
                        fields = []string{}
                        // keys counts how many times each base key was used so
                        // far, which gives the index suffix for the next use.
                        keys = map[string]uint{}
                )
                for _, ch := range children {
                        for _, key := range labelMap(ch) {
                                idx := keys[key]
                                keys[key] = idx + 1
                                // Guard the last-byte check so an empty key does
                                // not index out of range.
                                if idx > 0 || (len(key) > 0 && key[len(key)-1] == '.') {
                                        key = fmt.Sprintf("%s%d", key, idx)
                                }

                                info.Labels[key] = ch.Digest.String()
                                fields = append(fields, "labels."+key)
                        }
                }

                if _, err := manager.Update(ctx, info, fields...); err != nil {
                        return nil, err
                }

                return children, nil
        }
}

// FilterPlatforms is a handler wrapper which limits the descriptors returned
// based on matching the specified platform matcher.
//
// Children without a platform are always kept. A nil matcher disables the
// filtering and passes the children through unchanged.
func FilterPlatforms(f HandlerFunc, m platforms.Matcher) HandlerFunc {
        return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                children, err := f(ctx, desc)
                if err != nil {
                        return children, err
                }
                if m == nil {
                        return children, nil
                }

                var matched []ocispec.Descriptor
                for _, child := range children {
                        if child.Platform != nil && !m.Match(*child.Platform) {
                                continue
                        }
                        matched = append(matched, child)
                }

                return matched, nil
        }
}

// LimitManifests is a handler wrapper which filters the manifest descriptors
// returned using the provided platform.
// The results will be ordered according to the comparison operator and
// use the ordering in the manifests for equal matches.
// A limit of 0 or less is considered no limit.
// When a limit is set, a not found error is returned if no manifest is matched.
//
// Only the children of index descriptors are sorted and limited; children of
// any other descriptor are passed through. Children without a platform sort
// after those that have one.
func LimitManifests(f HandlerFunc, m platforms.MatchComparer, n int) HandlerFunc {
        return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                children, err := f(ctx, desc)
                if err != nil {
                        return children, err
                }

                // only limit manifests from an index
                if !IsIndexType(desc.MediaType) {
                        return children, nil
                }

                sort.SliceStable(children, func(i, j int) bool {
                        left, right := children[i].Platform, children[j].Platform
                        switch {
                        case left == nil:
                                return false
                        case right == nil:
                                return true
                        default:
                                return m.Less(*left, *right)
                        }
                })

                if n <= 0 {
                        return children, nil
                }
                if len(children) == 0 {
                        return children, fmt.Errorf("no match for platform in manifest: %w", errdefs.ErrNotFound)
                }
                if len(children) > n {
                        children = children[:n]
                }
                return children, nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "encoding/json"
        "fmt"
        "sort"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// Image provides the model for how containerd views container images.
type Image struct {
        // Name of the image.
        //
        // To be pulled, it must be a reference compatible with resolvers.
        //
        // This field is required.
        Name string

        // Labels provide runtime decoration for the image record.
        //
        // There is no default behavior for how these labels are propagated. They
        // only decorate the static metadata object.
        //
        // This field is optional.
        Labels map[string]string

        // Target describes the root content for this image. Typically, this is
        // a manifest, index or manifest list.
        Target ocispec.Descriptor

        // CreatedAt and UpdatedAt are timestamps of the image record.
        // NOTE(review): presumably maintained by the Store implementation —
        // confirm against the store.
        CreatedAt, UpdatedAt time.Time
}

// DeleteOptions provide options on image delete
type DeleteOptions struct {
        // Synchronous is set by SynchronousDelete.
        Synchronous bool
        // Target is set by DeleteTarget; it is nil unless that option is used.
        Target      *ocispec.Descriptor
}

// DeleteOpt allows configuring a delete operation. It receives the
// DeleteOptions to modify and may return an error.
type DeleteOpt func(context.Context, *DeleteOptions) error

// SynchronousDelete is used to indicate that an image deletion and removal of
// the image resources should occur synchronously before returning a result.
//
// The returned option sets DeleteOptions.Synchronous and never fails.
func SynchronousDelete() DeleteOpt {
        return func(_ context.Context, opts *DeleteOptions) error {
                opts.Synchronous = true
                return nil
        }
}

// DeleteTarget is used to specify the target value an image is expected
// to have when deleting. If the image has a different target, then
// NotFound is returned.
//
// The returned option stores target in DeleteOptions.Target and never fails.
func DeleteTarget(target *ocispec.Descriptor) DeleteOpt {
        return func(_ context.Context, opts *DeleteOptions) error {
                opts.Target = target
                return nil
        }
}

// Store and interact with images
type Store interface {
        // Get looks up a single image by name.
        Get(ctx context.Context, name string) (Image, error)
        // List returns images, optionally restricted by filters.
        // NOTE(review): the filter syntax is not defined here — see the
        // implementation.
        List(ctx context.Context, filters ...string) ([]Image, error)
        // Create takes an image record and returns an Image.
        // NOTE(review): presumably the stored form of the record — confirm
        // against the implementation.
        Create(ctx context.Context, image Image) (Image, error)

        // Update will replace the data in the store with the provided image. If
        // one or more fieldpaths are provided, only those fields will be updated.
        Update(ctx context.Context, image Image, fieldpaths ...string) (Image, error)

        // Delete removes the image with the given name. opts can adjust the
        // operation, see SynchronousDelete and DeleteTarget.
        Delete(ctx context.Context, name string, opts ...DeleteOpt) error
}

// TODO(stevvooe): Many of these functions make strong platform assumptions,
// which are untrue in a lot of cases. More refactoring must be done here to
// make this work in all cases.

// Config resolves the image configuration descriptor.
//
// The caller can then use the descriptor to resolve and process the
// configuration of the image.
//
// It delegates to the package-level Config using the image's Target.
func (image *Image) Config(ctx context.Context, provider content.Provider, platform platforms.MatchComparer) (ocispec.Descriptor, error) {
        return Config(ctx, provider, image.Target, platform)
}

// RootFS returns the unpacked diffids that make up an image's rootfs.
//
// These are used to verify that a set of layers unpacked to the expected
// values.
//
// The config descriptor is resolved for the given platform first and then
// handed to the package-level RootFS.
func (image *Image) RootFS(ctx context.Context, provider content.Provider, platform platforms.MatchComparer) ([]digest.Digest, error) {
        configDesc, err := image.Config(ctx, provider, platform)
        if err == nil {
                return RootFS(ctx, provider, configDesc)
        }
        return nil, err
}

// Size returns the total size of an image's packed resources.
//
// The image is walked from its Target. Every visited descriptor contributes
// its Size to the total, and a negative size aborts the walk with an error.
// Children are resolved with ChildrenHandler, filtered by platform, and
// limited to one manifest per index.
//
// The walk is run to completion before the total is read, so the result does
// not depend on the order in which the operands of a return statement are
// evaluated.
func (image *Image) Size(ctx context.Context, provider content.Provider, platform platforms.MatchComparer) (int64, error) {
        var size int64

        // accumulate adds up descriptor sizes and reports no children itself;
        // the children come from the second handler in the chain.
        accumulate := HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                if desc.Size < 0 {
                        return nil, fmt.Errorf("invalid size %v in %v (%v)", desc.Size, desc.Digest, desc.MediaType)
                }
                size += desc.Size
                return nil, nil
        })
        children := LimitManifests(FilterPlatforms(ChildrenHandler(provider), platform), platform, 1)

        err := Walk(ctx, Handlers(accumulate, children), image.Target)
        return size, err
}

// platformManifest pairs a manifest with a platform.
type platformManifest struct {
        p *ocispec.Platform // platform associated with m
        m *ocispec.Manifest // the manifest
}

// Manifest resolves a manifest from the image for the given platform.
//
// When a manifest descriptor inside of a manifest index does not have
// a platform defined, the platform from the image config is considered.
//
// If the descriptor points to a non-index manifest, then the manifest is
// unmarshalled and returned without considering the platform inside of the
// config.
//
// When platform is nil no platform filtering is applied. If no manifest is
// collected, an error wrapping errdefs.ErrNotFound is returned; the message
// mentions the platform when a platform-filtered index was walked. A
// descriptor that is neither a manifest nor an index also yields an error
// wrapping errdefs.ErrNotFound.
//
// TODO(stevvooe): This violates the current platform agnostic approach to this
// package by returning a specific manifest type. We'll need to refactor this
// to return a manifest descriptor or decide that we want to bring the API in
// this direction because this abstraction is not needed.
func Manifest(ctx context.Context, provider content.Provider, image ocispec.Descriptor, platform platforms.MatchComparer) (ocispec.Manifest, error) {
        var (
                // limit caps how many index entries are followed when a
                // platform matcher is given.
                limit    = 1
                // m collects the manifests accepted during the walk.
                m        []platformManifest
                // wasIndex records that a platform-filtered index was seen;
                // it only affects the not-found error message.
                wasIndex bool
        )

        if err := Walk(ctx, HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                if IsManifestType(desc.MediaType) {
                        p, err := content.ReadBlob(ctx, provider, desc)
                        if err != nil {
                                return nil, err
                        }

                        if err := validateMediaType(p, desc.MediaType); err != nil {
                                return nil, fmt.Errorf("manifest: invalid desc %s: %w", desc.Digest, err)
                        }

                        var manifest ocispec.Manifest
                        if err := json.Unmarshal(p, &manifest); err != nil {
                                return nil, err
                        }

                        // Platform filtering only applies to manifests reached
                        // through an index, never to the root descriptor itself.
                        if desc.Digest != image.Digest && platform != nil {
                                if desc.Platform != nil && !platform.Match(*desc.Platform) {
                                        return nil, nil
                                }

                                // No platform on the descriptor: fall back to the
                                // platform recorded in the image config.
                                if desc.Platform == nil {
                                        imagePlatform, err := ConfigPlatform(ctx, provider, manifest.Config)
                                        if err != nil {
                                                return nil, err
                                        }
                                        if !platform.Match(imagePlatform) {
                                                return nil, nil
                                        }

                                }
                        }

                        m = append(m, platformManifest{
                                p: desc.Platform,
                                m: &manifest,
                        })

                        // Manifests are leaves for this walk; do not descend.
                        return nil, nil
                } else if IsIndexType(desc.MediaType) {
                        p, err := content.ReadBlob(ctx, provider, desc)
                        if err != nil {
                                return nil, err
                        }

                        if err := validateMediaType(p, desc.MediaType); err != nil {
                                return nil, fmt.Errorf("manifest: invalid desc %s: %w", desc.Digest, err)
                        }

                        var idx ocispec.Index
                        if err := json.Unmarshal(p, &idx); err != nil {
                                return nil, err
                        }

                        // Without a matcher every entry of the index is walked.
                        if platform == nil {
                                return idx.Manifests, nil
                        }

                        // Keep entries that match, plus entries without a
                        // platform (those are checked against their config when
                        // the manifest itself is visited).
                        var descs []ocispec.Descriptor
                        for _, d := range idx.Manifests {
                                if d.Platform == nil || platform.Match(*d.Platform) {
                                        descs = append(descs, d)
                                }
                        }

                        // Order by the matcher's preference; entries without a
                        // platform sort after all entries that have one.
                        sort.SliceStable(descs, func(i, j int) bool {
                                if descs[i].Platform == nil {
                                        return false
                                }
                                if descs[j].Platform == nil {
                                        return true
                                }
                                return platform.Less(*descs[i].Platform, *descs[j].Platform)
                        })

                        wasIndex = true

                        if len(descs) > limit {
                                return descs[:limit], nil
                        }
                        return descs, nil
                }
                return nil, fmt.Errorf("unexpected media type %v for %v: %w", desc.MediaType, desc.Digest, errdefs.ErrNotFound)
        }), image); err != nil {
                return ocispec.Manifest{}, err
        }

        if len(m) == 0 {
                err := fmt.Errorf("manifest %v: %w", image.Digest, errdefs.ErrNotFound)
                if wasIndex {
                        err = fmt.Errorf("no match for platform in manifest %v: %w", image.Digest, errdefs.ErrNotFound)
                }
                return ocispec.Manifest{}, err
        }
        // Only the first collected manifest is returned.
        return *m[0].m, nil
}

// Config resolves the image configuration descriptor, using a content
// provider to resolve child resources of the image.
//
// The manifest for the given platform is resolved first and its Config
// descriptor is returned. The caller can then use the descriptor to resolve
// and process the configuration of the image. On failure the zero descriptor
// is returned together with the error from Manifest.
func Config(ctx context.Context, provider content.Provider, image ocispec.Descriptor, platform platforms.MatchComparer) (ocispec.Descriptor, error) {
        var configDesc ocispec.Descriptor

        manifest, err := Manifest(ctx, provider, image, platform)
        if err == nil {
                configDesc = manifest.Config
        }
        return configDesc, err
}

// Platforms returns one or more platforms supported by the image.
//
// The descriptor tree rooted at image is walked. A descriptor that carries an
// explicit platform contributes that platform and is not descended into (the
// handler returns ErrSkipDesc). For config descriptors without a platform,
// the platform is read from the config blob via ConfigPlatform. The returned
// slice holds whatever was collected up to the point the walk ended.
func Platforms(ctx context.Context, provider content.Provider, image ocispec.Descriptor) ([]ocispec.Platform, error) {
        var platformSpecs []ocispec.Platform

        collect := HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                if desc.Platform != nil {
                        platformSpecs = append(platformSpecs, *desc.Platform)
                        return nil, ErrSkipDesc
                }

                if IsConfigType(desc.MediaType) {
                        imagePlatform, err := ConfigPlatform(ctx, provider, desc)
                        if err != nil {
                                return nil, err
                        }
                        platformSpecs = append(platformSpecs, imagePlatform)
                }
                return nil, nil
        })

        // Finish the walk before reading platformSpecs. Writing
        // `return platformSpecs, Walk(...)` leaves the order between reading the
        // slice and calling Walk unspecified by the Go spec.
        err := Walk(ctx, Handlers(collect, ChildrenHandler(provider)), image)
        return platformSpecs, err
}

// Check reports which components of an image are available in the provider
// for the specified platform.
//
// If the manifest cannot be resolved because it is not found, available is
// false and both required and missing contain only the image descriptor.
//
// If available is true, the manifest was resolved and the caller can assume
// that required represents the complete set of content required for the
// image (the config followed by the layers). present holds the required
// components that could be opened in the provider; missing holds those the
// provider reported as not found. available stays true even when missing is
// non-empty.
//
// If there is a problem resolving content, an error will be returned.
func Check(ctx context.Context, provider content.Provider, image ocispec.Descriptor, platform platforms.MatchComparer) (available bool, required, present, missing []ocispec.Descriptor, err error) {
        mfst, err := Manifest(ctx, provider, image, platform)
        switch {
        case err == nil:
                // Manifest resolved; continue with the component check below.
        case errdefs.IsNotFound(err):
                return false, []ocispec.Descriptor{image}, nil, []ocispec.Descriptor{image}, nil
        default:
                return false, nil, nil, nil, fmt.Errorf("failed to check image %v: %w", image.Digest, err)
        }

        // TODO(stevvooe): It is possible that referenced components could have
        // children, but this is rare. For now, we ignore this and only verify
        // that manifest components are present.
        required = append([]ocispec.Descriptor{mfst.Config}, mfst.Layers...)

        for _, desc := range required {
                ra, err := provider.ReaderAt(ctx, desc)
                if err == nil {
                        // Opening succeeded, which is all we need to know.
                        ra.Close()
                        present = append(present, desc)
                        continue
                }
                if !errdefs.IsNotFound(err) {
                        return false, nil, nil, nil, fmt.Errorf("failed to check image %v: %w", desc.Digest, err)
                }
                missing = append(missing, desc)
        }

        return true, required, present, missing, nil
}

// Children returns the immediate children of content described by the
// descriptor.
//
// For a manifest the children are its config followed by its layers; for an
// index they are the listed manifests. Any other media type has no children;
// types that are neither layers nor known configs are logged at debug level.
func Children(ctx context.Context, provider content.Provider, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
        mt := desc.MediaType

        switch {
        case IsManifestType(mt):
                blob, err := content.ReadBlob(ctx, provider, desc)
                if err != nil {
                        return nil, err
                }
                if err := validateMediaType(blob, mt); err != nil {
                        return nil, fmt.Errorf("children: invalid desc %s: %w", desc.Digest, err)
                }

                // TODO(stevvooe): We just assume oci manifest, for now. There may be
                // subtle differences from the docker version.
                var manifest ocispec.Manifest
                if err := json.Unmarshal(blob, &manifest); err != nil {
                        return nil, err
                }
                return append([]ocispec.Descriptor{manifest.Config}, manifest.Layers...), nil

        case IsIndexType(mt):
                blob, err := content.ReadBlob(ctx, provider, desc)
                if err != nil {
                        return nil, err
                }
                if err := validateMediaType(blob, mt); err != nil {
                        return nil, fmt.Errorf("children: invalid desc %s: %w", desc.Digest, err)
                }

                var index ocispec.Index
                if err := json.Unmarshal(blob, &index); err != nil {
                        return nil, err
                }
                // Copy so the caller does not share the decoded index's slice.
                return append([]ocispec.Descriptor{}, index.Manifests...), nil

        case !IsLayerType(mt) && !IsKnownConfig(mt):
                // Layers and configs are childless data types and should not be logged.
                log.G(ctx).Debugf("encountered unknown type %v; children may not be fetched", desc.MediaType)
        }

        return nil, nil
}

// unknownDocument represents a manifest, manifest list, or index that has not
// yet been validated.
//
// Fields are kept as raw JSON so that only their presence (non-zero length)
// needs to be inspected, without decoding their contents.
type unknownDocument struct {
        MediaType string          `json:"mediaType,omitempty"` // media type the document declares for itself
        Config    json.RawMessage `json:"config,omitempty"`    // present in manifests
        Layers    json.RawMessage `json:"layers,omitempty"`    // present in manifests
        Manifests json.RawMessage `json:"manifests,omitempty"` // present in indexes / manifest lists
        FSLayers  json.RawMessage `json:"fsLayers,omitempty"` // schema 1
}

// validateMediaType checks that the blob b is consistent with the media type
// mt that referenced it.
//
// It returns an error if b is not valid JSON, if b contains schema 1
// fsLayers, if mt is a manifest type but b declares itself as or contains
// elements of an index, or if mt is an index type but b declares itself as
// or contains elements of a manifest.
func validateMediaType(b []byte, mt string) error {
        var doc unknownDocument
        if err := json.Unmarshal(b, &doc); err != nil {
                return err
        }

        switch {
        case len(doc.FSLayers) != 0:
                return fmt.Errorf("media-type: schema 1 not supported")
        case IsManifestType(mt) && (len(doc.Manifests) != 0 || IsIndexType(doc.MediaType)):
                return fmt.Errorf("media-type: expected manifest but found index (%s)", mt)
        case IsIndexType(mt) && (len(doc.Config) != 0 || len(doc.Layers) != 0 || IsManifestType(doc.MediaType)):
                return fmt.Errorf("media-type: expected index but found manifest (%s)", mt)
        }
        return nil
}

// RootFS returns the unpacked diff IDs that make up an image's rootfs.
//
// The config blob referenced by configDesc is read from the provider and
// decoded as an OCI image config. The diff IDs are used to verify that a set
// of layers unpacked to the expected values.
func RootFS(ctx context.Context, provider content.Provider, configDesc ocispec.Descriptor) ([]digest.Digest, error) {
        blob, err := content.ReadBlob(ctx, provider, configDesc)
        if err != nil {
                return nil, err
        }

        var img ocispec.Image
        if err = json.Unmarshal(blob, &img); err != nil {
                return nil, err
        }
        return img.RootFS.DiffIDs, nil
}

// ConfigPlatform returns a normalized platform from an image manifest config.
//
// The config blob referenced by configDesc is read from the provider and its
// platform fields are decoded and passed through platforms.Normalize.
func ConfigPlatform(ctx context.Context, provider content.Provider, configDesc ocispec.Descriptor) (ocispec.Platform, error) {
        blob, err := content.ReadBlob(ctx, provider, configDesc)
        if err != nil {
                return ocispec.Platform{}, err
        }

        // Technically, this should be ocispec.Image, but we only need the
        // ocispec.Platform that is embedded in the image struct.
        var decoded ocispec.Platform
        if err = json.Unmarshal(blob, &decoded); err != nil {
                return ocispec.Platform{}, err
        }

        normalized := platforms.Normalize(decoded)
        return normalized, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "fmt"
        "sort"
        "strings"

        "github.com/containerd/errdefs"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// mediatype definitions for image components handled in containerd.
//
// oci components are generally referenced directly, although we may centralize
// here for clarity.
const (
        // Docker schema 2 media types: layers (plain, foreign, and their
        // compressed variants), image config, manifest and manifest list.

        MediaTypeDockerSchema2Layer            = "application/vnd.docker.image.rootfs.diff.tar"
        MediaTypeDockerSchema2LayerForeign     = "application/vnd.docker.image.rootfs.foreign.diff.tar"
        MediaTypeDockerSchema2LayerGzip        = "application/vnd.docker.image.rootfs.diff.tar.gzip"
        MediaTypeDockerSchema2LayerZstd        = "application/vnd.docker.image.rootfs.diff.tar.zstd"
        MediaTypeDockerSchema2LayerForeignGzip = "application/vnd.docker.image.rootfs.foreign.diff.tar.gzip"
        MediaTypeDockerSchema2Config           = "application/vnd.docker.container.image.v1+json"
        MediaTypeDockerSchema2Manifest         = "application/vnd.docker.distribution.manifest.v2+json"
        MediaTypeDockerSchema2ManifestList     = "application/vnd.docker.distribution.manifest.list.v2+json"

        // Checkpoint/Restore Media Types

        MediaTypeContainerd1Checkpoint               = "application/vnd.containerd.container.criu.checkpoint.criu.tar"
        MediaTypeContainerd1CheckpointPreDump        = "application/vnd.containerd.container.criu.checkpoint.predump.tar"
        MediaTypeContainerd1Resource                 = "application/vnd.containerd.container.resource.tar"
        MediaTypeContainerd1RW                       = "application/vnd.containerd.container.rw.tar"
        MediaTypeContainerd1CheckpointConfig         = "application/vnd.containerd.container.checkpoint.config.v1+proto"
        MediaTypeContainerd1CheckpointOptions        = "application/vnd.containerd.container.checkpoint.options.v1+proto"
        MediaTypeContainerd1CheckpointRuntimeName    = "application/vnd.containerd.container.checkpoint.runtime.name"
        MediaTypeContainerd1CheckpointRuntimeOptions = "application/vnd.containerd.container.checkpoint.runtime.options+proto"

        // MediaTypeDockerSchema1Manifest is the legacy Docker schema1 manifest
        MediaTypeDockerSchema1Manifest = "application/vnd.docker.distribution.manifest.v1+prettyjws"

        // Encrypted media types: the OCI layer media types with an
        // "+encrypted" suffix appended.

        MediaTypeImageLayerEncrypted     = ocispec.MediaTypeImageLayer + "+encrypted"
        MediaTypeImageLayerGzipEncrypted = ocispec.MediaTypeImageLayerGzip + "+encrypted"
)

// DiffCompression returns the compression as defined by the layer diff media
// type.
//
// For Docker layer media types the compression comes from the base type:
// "gzip", "zstd", or "unknown" for the uncompressed types, which may in fact
// be compressed. If a Docker type carries any "+" suffix it is treated as
// wrapped and "" is returned. For OCI layer media types the compression is
// taken from the last (sorted) suffix when it is "gzip" or "zstd", otherwise
// "" is returned. If the media type is not recognized as a layer diff, an
// error wrapping errdefs.ErrNotImplemented is returned.
func DiffCompression(ctx context.Context, mediaType string) (string, error) {
        base, ext := parseMediaTypes(mediaType)
        wrapped := len(ext) > 0

        // compression that applies to an unwrapped Docker layer type.
        var dockerCompression string

        switch base {
        case ocispec.MediaTypeImageLayer, ocispec.MediaTypeImageLayerNonDistributable: //nolint:staticcheck // Non-distributable layers are deprecated
                if wrapped {
                        if last := ext[len(ext)-1]; last == "gzip" || last == "zstd" {
                                return last, nil
                        }
                }
                return "", nil
        case MediaTypeDockerSchema2Layer, MediaTypeDockerSchema2LayerForeign:
                // These media types may have been compressed but failed to
                // use the correct media type. The decompression function
                // should detect and handle this case.
                dockerCompression = "unknown"
        case MediaTypeDockerSchema2LayerGzip, MediaTypeDockerSchema2LayerForeignGzip:
                dockerCompression = "gzip"
        case MediaTypeDockerSchema2LayerZstd:
                dockerCompression = "zstd"
        default:
                return "", fmt.Errorf("unrecognised mediatype %s: %w", mediaType, errdefs.ErrNotImplemented)
        }

        if wrapped {
                // Type is wrapped
                return "", nil
        }
        return dockerCompression, nil
}

// parseMediaTypes splits a media type at its first "+" into the base type and
// a sorted list of the "+"-separated suffixes that follow it.
//
// When there is no "+" (including the empty string) the input is returned as
// the base type together with an empty, non-nil suffix slice.
func parseMediaTypes(mt string) (mediaType string, suffixes []string) {
        plus := strings.IndexByte(mt, '+')
        if plus < 0 {
                return mt, []string{}
        }
        mediaType = mt[:plus]

        // Splitting the extensions following the mediatype "(+)gzip+encrypted".
        // We expect this to be a limited list, so add an arbitrary limit (50).
        //
        // Note that DiffCompression is only using the last element, so perhaps we
        // should split on the last "+" only.
        suffixes = strings.SplitN(mt[plus+1:], "+", 50)
        sort.Strings(suffixes)
        return mediaType, suffixes
}

// IsNonDistributable returns true if the media type is non-distributable,
// i.e. it starts with the OCI non-distributable layer prefix or the Docker
// foreign layer prefix.
func IsNonDistributable(mt string) bool {
        for _, prefix := range [...]string{
                "application/vnd.oci.image.layer.nondistributable.",
                "application/vnd.docker.image.rootfs.foreign.",
        } {
                if strings.HasPrefix(mt, prefix) {
                        return true
                }
        }
        return false
}

// IsLayerType returns true if the media type is a layer: either any type
// under the OCI layer prefix, or one of the Docker schema 2 layer types
// (ignoring any "+" suffixes).
func IsLayerType(mt string) bool {
        if strings.HasPrefix(mt, "application/vnd.oci.image.layer.") {
                return true
        }

        // Parse Docker media types, strip off any + suffixes first
        base, _ := parseMediaTypes(mt)
        return base == MediaTypeDockerSchema2Layer ||
                base == MediaTypeDockerSchema2LayerGzip ||
                base == MediaTypeDockerSchema2LayerForeign ||
                base == MediaTypeDockerSchema2LayerForeignGzip ||
                base == MediaTypeDockerSchema2LayerZstd
}

// IsDockerType returns true if the media type has the
// "application/vnd.docker." prefix.
func IsDockerType(mt string) bool {
        const dockerPrefix = "application/vnd.docker."
        return strings.HasPrefix(mt, dockerPrefix)
}

// IsManifestType returns true if the media type is exactly the Docker
// schema 2 manifest type or the OCI image manifest type.
// No support for schema1 manifest.
func IsManifestType(mt string) bool {
        return mt == MediaTypeDockerSchema2Manifest || mt == ocispec.MediaTypeImageManifest
}

// IsIndexType returns true if the media type is exactly the OCI image index
// type or the Docker schema 2 manifest list type.
func IsIndexType(mt string) bool {
        return mt == ocispec.MediaTypeImageIndex || mt == MediaTypeDockerSchema2ManifestList
}

// IsConfigType returns true if the media type is exactly the Docker schema 2
// config type or the OCI image config type.
// No support for containerd checkpoint configs.
func IsConfigType(mt string) bool {
        return mt == MediaTypeDockerSchema2Config || mt == ocispec.MediaTypeImageConfig
}

// IsKnownConfig returns true if the media type is a known config type:
// the Docker schema 2 or OCI image config, or one of the containerd
// checkpoint types listed below.
func IsKnownConfig(mt string) bool {
        for _, known := range [...]string{
                MediaTypeDockerSchema2Config,
                ocispec.MediaTypeImageConfig,
                MediaTypeContainerd1Checkpoint,
                MediaTypeContainerd1CheckpointConfig,
        } {
                if mt == known {
                        return true
                }
        }
        return false
}

// ChildGCLabels returns the garbage-collection reference label (as a
// single-element slice) used to reference the given descriptor.
//
// Known configs, manifests and layers each get their own label; any other
// media type gets the generic content reference label prefix.
func ChildGCLabels(desc ocispec.Descriptor) []string {
        mt := desc.MediaType

        // Start with the generic label and specialise it by media type. The
        // case order matters: config is checked first, then manifest, then layer.
        label := "containerd.io/gc.ref.content."
        switch {
        case IsKnownConfig(mt):
                label = "containerd.io/gc.ref.content.config"
        case mt == MediaTypeDockerSchema2Manifest || mt == ocispec.MediaTypeImageManifest:
                label = "containerd.io/gc.ref.content.m."
        case IsLayerType(mt):
                label = "containerd.io/gc.ref.content.l."
        }
        return []string{label}
}

// ChildGCLabelsFilterLayers returns the labels for a given descriptor to
// reference it, as ChildGCLabels does, except that layer media types yield
// nil (no labels).
func ChildGCLabelsFilterLayers(desc ocispec.Descriptor) []string {
        if !IsLayerType(desc.MediaType) {
                return ChildGCLabels(desc)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package usage

import (
        "context"
        "strings"
        "sync/atomic"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"

        "golang.org/x/sync/semaphore"
)

// usageOptions holds the settings collected from Opt values for
// CalculateImageUsage.
type usageOptions struct {
        // platform, when non-nil, makes the calculation limit manifests by
        // platform and treat missing objects as errors.
        platform      platforms.MatchComparer
        // manifestLimit is the limit passed to images.LimitManifests.
        manifestLimit int
        // manifestOnly keeps the sizes recorded in descriptors even for
        // objects that cannot be found in the content store.
        manifestOnly  bool
        // snapshots, when non-nil, looks up a snapshotter by the name taken
        // from a "containerd.io/gc.ref.snapshot." label suffix.
        snapshots     func(name string) snapshots.Snapshotter
}

// Opt configures the image usage calculation; a non-nil error aborts it.
type Opt func(*usageOptions) error

// WithManifestLimit sets the platform matcher and the limit to the number of
// manifests which will be walked for usage. Setting the limit to 0 will
// require all manifests to be walked, returning ErrNotFound if manifests are
// missing.
// NOTE: By default all manifests which exist will be walked
// and any non-existent manifests and their subobjects will be ignored.
func WithManifestLimit(platform platforms.MatchComparer, i int) Opt {
        // If 0 then don't filter any manifests
        // By default limits to current platform
        apply := func(opts *usageOptions) error {
                opts.platform = platform
                opts.manifestLimit = i
                return nil
        }
        return apply
}

// WithSnapshotters will check for referenced snapshots from the image objects
// and include the snapshot size in the total usage. f resolves a snapshotter
// by name.
func WithSnapshotters(f func(string) snapshots.Snapshotter) Opt {
        apply := func(opts *usageOptions) error {
                opts.snapshots = f
                return nil
        }
        return apply
}

// WithManifestUsage is used to get the usage for an image based on what is
// reported by the manifests rather than what exists in the content store.
// NOTE: This function is best used with the manifest limit set to get a
// consistent value, otherwise non-existent manifests will be excluded.
func WithManifestUsage() Opt {
        apply := func(opts *usageOptions) error {
                opts.manifestOnly = true
                return nil
        }
        return apply
}

// CalculateImageUsage returns the total size attributed to image i by walking
// the content reachable from i.Target.
//
// Each visited descriptor contributes its size; when snapshotters are
// configured, the usage of snapshots referenced through
// "containerd.io/gc.ref.snapshot." labels on the content is added as well.
// How missing objects are handled depends on the options: with a platform
// set (WithManifestLimit) a not-found error from the children handler is
// returned; otherwise missing objects are skipped, counting zero unless
// WithManifestUsage is set.
func CalculateImageUsage(ctx context.Context, i images.Image, provider content.InfoReaderProvider, opts ...Opt) (int64, error) {
        var config usageOptions
        for _, opt := range opts {
                if err := opt(&config); err != nil {
                        return 0, err
                }
        }

        var (
                handler   = images.ChildrenHandler(provider)
                size      int64
                mustExist bool
        )

        // With a platform matcher, limit the manifests walked and require
        // the walked objects to exist.
        if config.platform != nil {
                handler = images.LimitManifests(handler, config.platform, config.manifestLimit)
                mustExist = true
        }

        var wh images.HandlerFunc = func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                // usage is this descriptor's contribution to the total.
                var usage int64
                children, err := handler(ctx, desc)
                if err != nil {
                        // Only a not-found error is tolerated, and only when
                        // objects are not required to exist.
                        if !errdefs.IsNotFound(err) || mustExist {
                                return nil, err
                        }
                        if !config.manifestOnly {
                                // Do not count size of non-existent objects
                                desc.Size = 0
                        }
                } else if config.snapshots != nil || !config.manifestOnly {
                        // Content info is needed for the actual size and/or
                        // for the snapshot reference labels.
                        info, err := provider.Info(ctx, desc.Digest)
                        if err != nil {
                                if !errdefs.IsNotFound(err) {
                                        return nil, err
                                }
                                if !config.manifestOnly {
                                        // Do not count size of non-existent objects
                                        desc.Size = 0
                                }
                        } else {
                                if info.Size > desc.Size {
                                        // Count actual usage, Size may be unset or -1
                                        desc.Size = info.Size
                                }

                                if config.snapshots != nil {
                                        // The label key suffix names the snapshotter and
                                        // the label value is the snapshot key.
                                        for k, v := range info.Labels {
                                                const prefix = "containerd.io/gc.ref.snapshot."
                                                if !strings.HasPrefix(k, prefix) {
                                                        continue
                                                }

                                                sn := config.snapshots(k[len(prefix):])
                                                if sn == nil {
                                                        continue
                                                }

                                                u, err := sn.Usage(ctx, v)
                                                if err != nil {
                                                        // Not-found and invalid-argument errors
                                                        // are skipped; anything else is fatal.
                                                        if !errdefs.IsNotFound(err) && !errdefs.IsInvalidArgument(err) {
                                                                return nil, err
                                                        }
                                                } else {
                                                        usage += u.Size
                                                }
                                        }
                                }
                        }
                }

                // Ignore unknown sizes. Generally unknown sizes should
                // never be set in manifests, however, the usage
                // calculation does not need to enforce this.
                if desc.Size >= 0 {
                        usage += desc.Size
                }

                // NOTE(review): atomic add presumably because images.Dispatch
                // may invoke this handler concurrently — confirm.
                atomic.AddInt64(&size, usage)

                return children, nil
        }

        // NOTE(review): the weighted semaphore (3) presumably bounds the
        // concurrency of images.Dispatch — confirm against its documentation.
        l := semaphore.NewWeighted(3)
        if err := images.Dispatch(ctx, wh, l, i.Target); err != nil {
                return 0, err
        }

        return size, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package introspectionproxy

import (
        "context"
        "fmt"

        api "github.com/containerd/containerd/api/services/introspection/v1"
        "github.com/containerd/containerd/v2/core/introspection"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/ttrpc"
        "google.golang.org/grpc"
        "google.golang.org/protobuf/types/known/anypb"
        "google.golang.org/protobuf/types/known/emptypb"
)

// Compile-time check that *introspectionRemote implements introspection.Service.
var _ = (introspection.Service)(&introspectionRemote{})

// NewIntrospectionProxy creates a new introspection service from an API client.
//
// client may be an api.IntrospectionClient, an api.TTRPCIntrospectionService,
// a grpc.ClientConnInterface, or a *ttrpc.Client. gRPC-based clients are
// wrapped in convertIntrospection so they can be used through the TTRPC
// service interface. Any other type causes a panic with an error wrapping
// errdefs.ErrNotImplemented.
func NewIntrospectionProxy(client any) introspection.Service {
        switch c := client.(type) {
        case api.IntrospectionClient:
                return &introspectionRemote{client: convertIntrospection{c}}
        case api.TTRPCIntrospectionService:
                return &introspectionRemote{client: c}
        case grpc.ClientConnInterface:
                return &introspectionRemote{client: convertIntrospection{api.NewIntrospectionClient(c)}}
        case *ttrpc.Client:
                return &introspectionRemote{client: api.NewTTRPCIntrospectionClient(c)}
        default:
                panic(fmt.Errorf("unsupported introspection client %T: %w", client, errdefs.ErrNotImplemented))
        }
}

// introspectionRemote implements introspection.Service by forwarding calls to
// a remote introspection API client.
type introspectionRemote struct {
        // client is the underlying API client; gRPC clients are adapted to
        // this interface via convertIntrospection.
        client api.TTRPCIntrospectionService
}

// Plugins requests the plugins matching filters from the remote introspection
// service. The filters are logged at debug level. Errors from the client are
// passed through errdefs.FromGRPC.
func (i *introspectionRemote) Plugins(ctx context.Context, filters ...string) (*api.PluginsResponse, error) {
        log.G(ctx).WithField("filters", filters).Debug("remote introspection plugin filters")

        req := &api.PluginsRequest{
                Filters: filters,
        }
        resp, err := i.client.Plugins(ctx, req)
        if err == nil {
                return resp, nil
        }
        return nil, errdefs.FromGRPC(err)
}

// Server requests server information from the remote introspection service.
// Errors from the client are passed through errdefs.FromGRPC.
func (i *introspectionRemote) Server(ctx context.Context) (*api.ServerResponse, error) {
        resp, err := i.client.Server(ctx, &emptypb.Empty{})
        if err == nil {
                return resp, nil
        }
        return nil, errdefs.FromGRPC(err)
}

// PluginInfo requests information about the plugin identified by pluginType
// and id from the remote service.
//
// When options is non-nil it is marshaled with protobuf.MarshalAnyToProto and
// sent as the request's Options field; a marshaling failure is returned
// wrapped, without contacting the service. A failed remote call yields a nil
// response and the error converted by errdefs.FromGRPC, matching Plugins and
// Server.
func (i *introspectionRemote) PluginInfo(ctx context.Context, pluginType, id string, options any) (resp *api.PluginInfoResponse, err error) {
        var optionsPB *anypb.Any
        if options != nil {
                optionsPB, err = protobuf.MarshalAnyToProto(options)
                if err != nil {
                        return nil, fmt.Errorf("failed to marshal runtime request: %w", err)
                }
        }

        resp, err = i.client.PluginInfo(ctx, &api.PluginInfoRequest{
                Type:    pluginType,
                ID:      id,
                Options: optionsPB,
        })
        if err != nil {
                // Never hand back a response together with an error.
                return nil, errdefs.FromGRPC(err)
        }

        return resp, nil
}

// convertIntrospection wraps an api.IntrospectionClient so it can be stored
// in introspectionRemote.client, which is an api.TTRPCIntrospectionService.
type convertIntrospection struct {
        // client is the wrapped introspection client that receives every call.
        client api.IntrospectionClient
}

// Plugins forwards the request unchanged to the wrapped client's Plugins
// method and returns its result as is.
func (c convertIntrospection) Plugins(ctx context.Context, req *api.PluginsRequest) (*api.PluginsResponse, error) {
        return c.client.Plugins(ctx, req)
}
// Server forwards the request unchanged to the wrapped client's Server method
// and returns its result as is.
func (c convertIntrospection) Server(ctx context.Context, in *emptypb.Empty) (*api.ServerResponse, error) {
        return c.client.Server(ctx, in)
}
// PluginInfo forwards the request unchanged to the wrapped client's
// PluginInfo method and returns its result as is.
func (c convertIntrospection) PluginInfo(ctx context.Context, req *api.PluginInfoRequest) (*api.PluginInfoResponse, error) {
        return c.client.PluginInfo(ctx, req)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package leases

import "context"

// leaseKey is the context key under which WithLease stores the lease ID and
// from which FromContext reads it.
type leaseKey struct{}

// WithLease returns a context that carries the lease ID lid.
//
// The ID is stored as a context value and is additionally placed on the
// outgoing gRPC headers so that clients using the context pick it up.
func WithLease(ctx context.Context, lid string) context.Context {
        leased := context.WithValue(ctx, leaseKey{}, lid)
        return withGRPCLeaseHeader(leased, lid)
}

// FromContext returns the lease ID carried by ctx and whether one was found.
//
// The context value set by WithLease takes precedence; when it is absent the
// gRPC headers are consulted instead.
func FromContext(ctx context.Context) (string, bool) {
        if lid, ok := ctx.Value(leaseKey{}).(string); ok {
                return lid, true
        }

        return fromGRPCHeader(ctx)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package leases

import (
        "context"

        "google.golang.org/grpc/metadata"
)

const (
        // GRPCHeader defines the header name for specifying a containerd lease.
        // withGRPCLeaseHeader writes it to the outgoing gRPC metadata and
        // fromGRPCHeader reads it from the incoming gRPC metadata.
        GRPCHeader = "containerd-lease"
)

// withGRPCLeaseHeader returns a context whose outgoing gRPC metadata carries
// lid under GRPCHeader, so that clients using the context pick it up.
//
// Metadata already present on the outgoing context is kept; the new lease
// header is joined in front of it so the latest lease comes first.
func withGRPCLeaseHeader(ctx context.Context, lid string) context.Context {
        header := metadata.Pairs(GRPCHeader, lid)

        if existing, found := metadata.FromOutgoingContext(ctx); found {
                // The new header goes first so the latest lease leads the list.
                header = metadata.Join(header, existing)
        }

        return metadata.NewOutgoingContext(ctx, header)
}

// fromGRPCHeader extracts the lease ID from the incoming gRPC metadata of ctx,
// for use in gRPC servers. It returns the first value stored under GRPCHeader,
// or "" and false when there is no incoming metadata or no such header.
func fromGRPCHeader(ctx context.Context) (string, bool) {
        incoming, found := metadata.FromIncomingContext(ctx)
        if !found {
                return "", false
        }

        if leaseIDs := incoming[GRPCHeader]; len(leaseIDs) > 0 {
                return leaseIDs[0], true
        }

        return "", false
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package leases

import (
        "crypto/rand"
        "encoding/base64"
        "fmt"
        "time"
)

// WithRandomID sets the lease ID to a random unique value.
//
// The generated ID has the form "<nanoseconds>-<random>", where <nanoseconds>
// is the nanosecond offset within the current second and <random> is the
// URL-safe base64 encoding of 3 bytes read from crypto/rand. The returned Opt
// fails, leaving the lease ID untouched, if the random bytes cannot be read.
func WithRandomID() Opt {
        return func(l *Lease) error {
                t := time.Now()
                var b [3]byte
                // Do not silently fall back to an all-zero suffix when the
                // random source fails.
                if _, err := rand.Read(b[:]); err != nil {
                        return fmt.Errorf("failed to generate random lease ID: %w", err)
                }
                l.ID = fmt.Sprintf("%d-%s", t.Nanosecond(), base64.URLEncoding.EncodeToString(b[:]))
                return nil
        }
}

// WithID returns an Opt that assigns id as the lease ID. The returned Opt
// never fails.
func WithID(id string) Opt {
        assign := func(lease *Lease) error {
                lease.ID = id
                return nil
        }
        return assign
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package leases

import (
        "context"
        "time"
)

// Opt is used to set options on a lease. It receives the lease being
// configured and may modify it in place; a non-nil error reports that the
// option could not be applied.
type Opt func(*Lease) error

// DeleteOpt allows configuring a delete operation. It receives the context of
// the delete call and the DeleteOptions to modify in place.
type DeleteOpt func(context.Context, *DeleteOptions) error

// Manager is used to create, list, and remove leases, and to manage the
// resources attached to a lease.
type Manager interface {
        // Create creates a lease configured by the given options.
        Create(context.Context, ...Opt) (Lease, error)
        // Delete removes the given lease, honoring the given delete options.
        Delete(context.Context, Lease, ...DeleteOpt) error
        // List returns leases; the string arguments are filters.
        List(context.Context, ...string) ([]Lease, error)
        // AddResource attaches the given resource to the lease.
        AddResource(context.Context, Lease, Resource) error
        // DeleteResource detaches the given resource from the lease.
        DeleteResource(context.Context, Lease, Resource) error
        // ListResources returns the resources attached to the lease.
        ListResources(context.Context, Lease) ([]Resource, error)
}

// Lease retains resources to prevent cleanup before
// the resources can be fully referenced.
type Lease struct {
        // ID identifies the lease; see WithID and WithRandomID.
        ID string
        // CreatedAt is the time the lease was created.
        CreatedAt time.Time
        // Labels holds the lease's labels. WithExpiration stores the expiry
        // time here under the "containerd.io/gc.expire" key.
        Labels map[string]string
}

// Resource represents a low-level resource of an image, such as content,
// ingest or snapshotter, that can be attached to a lease.
type Resource struct {
        // ID identifies the resource.
        ID string
        // Type names the kind of resource.
        Type string
}

// DeleteOptions provides options for a lease delete operation.
type DeleteOptions struct {
        // Synchronous requests a synchronous delete; it is set by
        // SynchronousDelete.
        Synchronous bool
}

// SynchronousDelete is used to indicate that a lease deletion and removal of
// any unreferenced resources should occur synchronously before returning the
// result.
//
// It is a DeleteOpt: it sets o.Synchronous to true and always returns nil.
// The context argument is not used.
func SynchronousDelete(ctx context.Context, o *DeleteOptions) error {
        o.Synchronous = true
        return nil
}

// WithLabel returns an Opt that sets a single label on a lease, keeping all
// other labels intact. An existing value for the same label is overwritten.
// The label map is created when the lease has none yet.
func WithLabel(label, value string) Opt {
        return func(lease *Lease) error {
                if lease.Labels == nil {
                        lease.Labels = make(map[string]string, 1)
                }
                lease.Labels[label] = value
                return nil
        }
}

// WithLabels returns an Opt that merges the given labels into a lease's
// labels, overwriting values of keys that already exist. The lease's label
// map is created when it is nil, even if labels is empty.
func WithLabels(labels map[string]string) Opt {
        return func(lease *Lease) error {
                if lease.Labels == nil {
                        lease.Labels = make(map[string]string)
                }
                for key := range labels {
                        lease.Labels[key] = labels[key]
                }
                return nil
        }
}

// WithExpiration returns an Opt that sets an expiration on the lease.
//
// The expiry is computed as the time the Opt is applied plus d, formatted as
// RFC 3339, and stored in the "containerd.io/gc.expire" label. The label map
// is created when the lease has none yet.
func WithExpiration(d time.Duration) Opt {
        const expireLabel = "containerd.io/gc.expire"

        return func(lease *Lease) error {
                if lease.Labels == nil {
                        lease.Labels = make(map[string]string)
                }

                expiresAt := time.Now().Add(d)
                lease.Labels[expireLabel] = expiresAt.Format(time.RFC3339)
                return nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proxy

import (
        "context"

        leasesapi "github.com/containerd/containerd/api/services/leases/v1"
        "github.com/containerd/containerd/v2/core/leases"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
)

// proxyManager implements leases.Manager by forwarding every operation to a
// leases API client.
type proxyManager struct {
        // client is the leases API client that receives the forwarded calls.
        client leasesapi.LeasesClient
}

// NewLeaseManager returns a lease manager which communicates
// through a grpc lease service, using the given client for every call.
func NewLeaseManager(client leasesapi.LeasesClient) leases.Manager {
        pm := new(proxyManager)
        pm.client = client
        return pm
}

// Create applies opts to a fresh lease and asks the lease service to create a
// lease with the resulting ID and labels.
//
// An error from an option is returned as is, before the service is contacted;
// a service error is converted with errdefs.FromGRPC. On success the lease
// reported back by the service is returned.
func (pm *proxyManager) Create(ctx context.Context, opts ...leases.Opt) (leases.Lease, error) {
        var requested leases.Lease
        for _, apply := range opts {
                if err := apply(&requested); err != nil {
                        return leases.Lease{}, err
                }
        }

        req := &leasesapi.CreateRequest{
                ID:     requested.ID,
                Labels: requested.Labels,
        }
        resp, err := pm.client.Create(ctx, req)
        if err != nil {
                return leases.Lease{}, errdefs.FromGRPC(err)
        }

        created := leases.Lease{
                ID:        resp.Lease.ID,
                CreatedAt: protobuf.FromTimestamp(resp.Lease.CreatedAt),
                Labels:    resp.Lease.Labels,
        }
        return created, nil
}

// Delete asks the lease service to delete the lease with l's ID.
//
// The delete options are evaluated first; an error from one of them is
// returned as is, before the service is contacted. The Synchronous option is
// sent as the request's Sync field. The service result is passed through
// errdefs.FromGRPC.
func (pm *proxyManager) Delete(ctx context.Context, l leases.Lease, opts ...leases.DeleteOpt) error {
        options := leases.DeleteOptions{}
        for _, apply := range opts {
                if err := apply(ctx, &options); err != nil {
                        return err
                }
        }

        req := &leasesapi.DeleteRequest{ID: l.ID, Sync: options.Synchronous}
        _, err := pm.client.Delete(ctx, req)
        return errdefs.FromGRPC(err)
}

// List asks the lease service for the leases matching filters and converts
// each entry of the response into a leases.Lease. A service error is
// converted with errdefs.FromGRPC.
func (pm *proxyManager) List(ctx context.Context, filters ...string) ([]leases.Lease, error) {
        resp, err := pm.client.List(ctx, &leasesapi.ListRequest{Filters: filters})
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        result := make([]leases.Lease, 0, len(resp.Leases))
        for _, item := range resp.Leases {
                result = append(result, leases.Lease{
                        ID:        item.ID,
                        CreatedAt: protobuf.FromTimestamp(item.CreatedAt),
                        Labels:    item.Labels,
                })
        }

        return result, nil
}

// AddResource asks the lease service to add resource r to the lease with the
// given lease's ID. The service result is passed through errdefs.FromGRPC.
func (pm *proxyManager) AddResource(ctx context.Context, lease leases.Lease, r leases.Resource) error {
        req := &leasesapi.AddResourceRequest{
                ID:       lease.ID,
                Resource: &leasesapi.Resource{ID: r.ID, Type: r.Type},
        }

        _, err := pm.client.AddResource(ctx, req)
        return errdefs.FromGRPC(err)
}

// DeleteResource asks the lease service to remove resource r from the lease
// with the given lease's ID. The service result is passed through
// errdefs.FromGRPC.
func (pm *proxyManager) DeleteResource(ctx context.Context, lease leases.Lease, r leases.Resource) error {
        req := &leasesapi.DeleteResourceRequest{
                ID:       lease.ID,
                Resource: &leasesapi.Resource{ID: r.ID, Type: r.Type},
        }

        _, err := pm.client.DeleteResource(ctx, req)
        return errdefs.FromGRPC(err)
}

// ListResources asks the lease service for the resources of the lease with
// the given lease's ID and converts each entry of the response into a
// leases.Resource. A service error is converted with errdefs.FromGRPC.
func (pm *proxyManager) ListResources(ctx context.Context, lease leases.Lease) ([]leases.Resource, error) {
        req := &leasesapi.ListResourcesRequest{ID: lease.ID}
        resp, err := pm.client.ListResources(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        resources := make([]leases.Resource, len(resp.Resources))
        for idx, res := range resp.Resources {
                resources[idx] = leases.Resource{ID: res.ID, Type: res.Type}
        }
        return resources, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
        "strings"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/leases"
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/pkg/filters"
)

// adaptImage exposes an images.Image to the filter machinery. o must hold an
// images.Image; any other dynamic type makes the type assertion panic.
//
// Supported field paths are "name", "target.digest", "target.mediatype",
// "labels.<key>" and "annotations.<key>" (the target's annotations). A field
// that is unknown or has an empty value is reported as not present.
func adaptImage(o interface{}) filters.Adaptor {
        img := o.(images.Image)

        lookup := func(fieldpath []string) (string, bool) {
                if len(fieldpath) == 0 {
                        return "", false
                }

                rest := fieldpath[1:]
                switch fieldpath[0] {
                case "name":
                        return img.Name, len(img.Name) > 0
                case "target":
                        if len(rest) == 0 {
                                return "", false
                        }

                        switch rest[0] {
                        case "digest":
                                return img.Target.Digest.String(), len(img.Target.Digest) > 0
                        case "mediatype":
                                return img.Target.MediaType, len(img.Target.MediaType) > 0
                        }
                        // TODO(stevvooe): Greater/Less than filters would be awesome for
                        // size. Let's do it!
                case "labels":
                        return checkMap(rest, img.Labels)
                case "annotations":
                        return checkMap(rest, img.Target.Annotations)
                }

                return "", false
        }

        return filters.AdapterFunc(lookup)
}
// adaptContainer exposes a containers.Container to the filter machinery. o
// must hold a containers.Container; any other dynamic type makes the type
// assertion panic.
//
// Supported field paths are "id", "runtime.name", "image" and "labels.<key>".
// A field that is unknown or has an empty value is reported as not present.
func adaptContainer(o interface{}) filters.Adaptor {
        ctr := o.(containers.Container)

        return filters.AdapterFunc(func(fieldpath []string) (string, bool) {
                if len(fieldpath) == 0 {
                        return "", false
                }

                rest := fieldpath[1:]
                switch fieldpath[0] {
                case "id":
                        return ctr.ID, len(ctr.ID) > 0
                case "runtime":
                        if len(rest) > 0 && rest[0] == "name" {
                                return ctr.Runtime.Name, len(ctr.Runtime.Name) > 0
                        }
                        return "", false
                case "image":
                        return ctr.Image, len(ctr.Image) > 0
                case "labels":
                        return checkMap(rest, ctr.Labels)
                default:
                        return "", false
                }
        })
}

// adaptContentStatus exposes a content.Status to the filter machinery. The
// only supported field is "ref", which is always reported as present, even
// when empty.
func adaptContentStatus(status content.Status) filters.Adaptor {
        return filters.AdapterFunc(func(fieldpath []string) (string, bool) {
                if len(fieldpath) > 0 && fieldpath[0] == "ref" {
                        return status.Ref, true
                }

                return "", false
        })
}

// adaptLease exposes a leases.Lease to the filter machinery. Supported field
// paths are "id", reported as present only when non-empty, and
// "labels.<key>".
func adaptLease(lease leases.Lease) filters.Adaptor {
        resolve := func(fieldpath []string) (string, bool) {
                if len(fieldpath) == 0 {
                        return "", false
                }

                if fieldpath[0] == "id" {
                        return lease.ID, len(lease.ID) > 0
                }
                if fieldpath[0] == "labels" {
                        return checkMap(fieldpath[1:], lease.Labels)
                }

                return "", false
        }

        return filters.AdapterFunc(resolve)
}

// adaptSnapshot exposes a snapshots.Info to the filter machinery.
//
// Supported field paths are "kind" ("active", "view" or "committed"; any
// other kind is reported as not present), "name" and "parent" (both always
// reported as present), and "labels.<key>".
func adaptSnapshot(info snapshots.Info) filters.Adaptor {
        // kindName maps the snapshot kind to its filter string.
        kindName := func() (string, bool) {
                switch info.Kind {
                case snapshots.KindActive:
                        return "active", true
                case snapshots.KindView:
                        return "view", true
                case snapshots.KindCommitted:
                        return "committed", true
                default:
                        return "", false
                }
        }

        return filters.AdapterFunc(func(fieldpath []string) (string, bool) {
                if len(fieldpath) == 0 {
                        return "", false
                }

                switch fieldpath[0] {
                case "kind":
                        return kindName()
                case "name":
                        return info.Name, true
                case "parent":
                        return info.Parent, true
                case "labels":
                        return checkMap(fieldpath[1:], info.Labels)
                default:
                        return "", false
                }
        })
}

// adaptSandbox exposes a sandbox.Sandbox to the filter machinery. Supported
// field paths are "id", always reported as present, and "labels.<key>".
func adaptSandbox(instance *sandbox.Sandbox) filters.Adaptor {
        resolve := func(fieldpath []string) (string, bool) {
                if len(fieldpath) == 0 {
                        return "", false
                }

                switch fieldpath[0] {
                case "id":
                        return instance.ID, true
                case "labels":
                        return checkMap(fieldpath[1:], instance.Labels)
                }

                return "", false
        }

        return filters.AdapterFunc(resolve)
}

// checkMap looks up the key formed by joining fieldpath with "." in m. It
// returns the stored value and true when the key exists, otherwise "" and
// false. A nil or empty map never contains the key.
func checkMap(fieldpath []string, m map[string]string) (string, bool) {
        key := strings.Join(fieldpath, ".")
        if value, found := m[key]; found {
                return value, true
        }

        return "", false
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
        "context"
        "fmt"

        bolt "go.etcd.io/bbolt"
)

// transactionKey is the context key under which WithTransactionContext stores
// the bolt transaction and from which view and update read it.
type transactionKey struct{}

// WithTransactionContext returns a new context holding the provided
// bolt transaction. Functions which require a bolt transaction will
// first check to see if a transaction is already created on the
// context before creating their own.
//
// The transaction is stored as a context value under transactionKey.
func WithTransactionContext(ctx context.Context, tx *bolt.Tx) context.Context {
        return context.WithValue(ctx, transactionKey{}, tx)
}

// transactor is the database interface needed by view and update to start a
// transaction of their own when the context does not carry one.
type transactor interface {
        // View runs fn with a transaction; it is used by view.
        View(fn func(*bolt.Tx) error) error
        // Update runs fn with a transaction; it is used by update.
        Update(fn func(*bolt.Tx) error) error
}

// view runs fn with a bolt transaction. A transaction stored on the context
// is reused when present; otherwise a new one is started through db.View.
func view(ctx context.Context, db transactor, fn func(*bolt.Tx) error) error {
        if tx, ok := ctx.Value(transactionKey{}).(*bolt.Tx); ok {
                return fn(tx)
        }

        return db.View(fn)
}

// update runs fn with a writable bolt transaction. A transaction stored on
// the context is reused when present, but only if it is writable; a read-only
// one yields an error wrapping bolt.ErrTxNotWritable. Without a transaction
// on the context a new one is started through db.Update.
func update(ctx context.Context, db transactor, fn func(*bolt.Tx) error) error {
        tx, ok := ctx.Value(transactionKey{}).(*bolt.Tx)

        switch {
        case !ok:
                return db.Update(fn)
        case !tx.Writable():
                return fmt.Errorf("unable to use transaction from context: %w", bolt.ErrTxNotWritable)
        default:
                return fn(tx)
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package boltutil

import (
        "fmt"
        "time"

        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/containerd/v2/pkg/protobuf/proto"
        "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/typeurl/v2"
        bolt "go.etcd.io/bbolt"
)

// Keys used by the helpers in this package to store and retrieve values and
// nested buckets inside a caller-provided bucket.
var (
        bucketKeyAnnotations = []byte("annotations")
        bucketKeyLabels      = []byte("labels")
        bucketKeyCreatedAt   = []byte("createdat")
        bucketKeyUpdatedAt   = []byte("updatedat")
        bucketKeyExtensions  = []byte("extensions")
)

// ReadLabels reads the labels key from the bucket.
// Uses the key "labels".
//
// It returns a nil map when the bucket has no "labels" sub-bucket.
func ReadLabels(bkt *bolt.Bucket) (map[string]string, error) {
        return readMap(bkt, bucketKeyLabels)
}

// ReadAnnotations reads the OCI Descriptor Annotations key from the bucket.
// Uses the key "annotations".
//
// It returns a nil map when the bucket has no "annotations" sub-bucket.
func ReadAnnotations(bkt *bolt.Bucket) (map[string]string, error) {
        return readMap(bkt, bucketKeyAnnotations)
}

// readMap reads every key/value pair of the sub-bucket named bucketName into
// a string map. It returns a nil map and a nil error when the sub-bucket does
// not exist, and a nil map with the error when iterating the sub-bucket
// fails.
func readMap(bkt *bolt.Bucket, bucketName []byte) (map[string]string, error) {
        sub := bkt.Bucket(bucketName)
        if sub == nil {
                return nil, nil
        }

        entries := make(map[string]string)
        collect := func(k, v []byte) error {
                entries[string(k)] = string(v)
                return nil
        }
        if err := sub.ForEach(collect); err != nil {
                return nil, err
        }

        return entries, nil
}

// WriteLabels will write a new labels bucket to the provided bucket at key
// bucketKeyLabels, replacing the contents of the bucket with the provided map.
//
// The provided map labels will be modified to have the final contents of the
// bucket. Typically, this removes zero-value entries.
// Uses the key "labels".
func WriteLabels(bkt *bolt.Bucket, labels map[string]string) error {
        return writeMap(bkt, bucketKeyLabels, labels)
}

// WriteAnnotations writes the OCI Descriptor Annotations to the provided
// bucket, replacing any existing annotations bucket with the provided map.
// Uses the key "annotations".
//
// Entries of the provided map whose value is empty are not stored and are
// removed from the map.
func WriteAnnotations(bkt *bolt.Bucket, labels map[string]string) error {
        return writeMap(bkt, bucketKeyAnnotations, labels)
}

// writeMap replaces the sub-bucket named bucketName with the contents of
// labels.
//
// An existing sub-bucket is deleted first so that old and new entries are not
// merged. When labels is empty no new sub-bucket is created. Entries with an
// empty value are not stored and are removed from the labels map, so the map
// ends up mirroring what was written.
func writeMap(bkt *bolt.Bucket, bucketName []byte, labels map[string]string) error {
        // Drop the previous contents to keep from merging.
        if bkt.Bucket(bucketName) != nil {
                if err := bkt.DeleteBucket(bucketName); err != nil {
                        return err
                }
        }

        if len(labels) == 0 {
                return nil
        }

        target, err := bkt.CreateBucket(bucketName)
        if err != nil {
                return err
        }

        for key, val := range labels {
                if val != "" {
                        if err := target.Put([]byte(key), []byte(val)); err != nil {
                                return fmt.Errorf("failed to set label %q=%q: %w", key, val, err)
                        }
                        continue
                }

                // Empty values are not stored, so drop them from the map too.
                delete(labels, key)
        }

        return nil
}

// ReadTimestamps reads created and updated timestamps from a bucket.
// Uses keys "createdat" and "updatedat".
//
// A timestamp whose key is missing from the bucket leaves the corresponding
// output untouched. The created timestamp is decoded first; a decoding error
// is returned immediately.
func ReadTimestamps(bkt *bolt.Bucket, created, updated *time.Time) error {
        // load decodes the value stored under key into dst, if there is one.
        load := func(key []byte, dst *time.Time) error {
                raw := bkt.Get(key)
                if raw == nil {
                        return nil
                }
                return dst.UnmarshalBinary(raw)
        }

        if err := load(bucketKeyCreatedAt, created); err != nil {
                return err
        }
        if err := load(bucketKeyUpdatedAt, updated); err != nil {
                return err
        }
        return nil
}

// WriteTimestamps writes created and updated timestamps to a bucket.
// Uses keys "createdat" and "updatedat".
//
// Both timestamps are marshaled to their binary form before anything is
// written, so a marshaling error leaves the bucket untouched.
func WriteTimestamps(bkt *bolt.Bucket, created, updated time.Time) error {
        createdAt, err := created.MarshalBinary()
        if err != nil {
                return err
        }

        updatedAt, err := updated.MarshalBinary()
        if err != nil {
                return err
        }

        if err := bkt.Put(bucketKeyCreatedAt, createdAt); err != nil {
                return err
        }
        if err := bkt.Put(bucketKeyUpdatedAt, updatedAt); err != nil {
                return err
        }

        return nil
}

// WriteExtensions will write a KV map to the given bucket,
// where `K` is a string key and `V` is a protobuf's Any type that represents a generic extension.
//
// The entries are stored in the "extensions" sub-bucket, which is created if
// it does not exist yet. Nothing is written, and no sub-bucket is created,
// when extensions is empty.
func WriteExtensions(bkt *bolt.Bucket, extensions map[string]typeurl.Any) error {
        if len(extensions) == 0 {
                return nil
        }

        ebkt, err := bkt.CreateBucketIfNotExists(bucketKeyExtensions)
        if err != nil {
                return err
        }

        for name, ext := range extensions {
                pbExt := protobuf.FromAny(ext)

                encoded, err := proto.Marshal(pbExt)
                if err != nil {
                        return err
                }

                if err := ebkt.Put([]byte(name), encoded); err != nil {
                        return err
                }
        }

        return nil
}

// ReadExtensions will read back a map of extensions from the given bucket, previously written by WriteExtensions.
//
// It returns an empty, non-nil map when the bucket has no "extensions"
// sub-bucket, and a nil map with the error when an entry cannot be
// unmarshaled or iteration fails.
func ReadExtensions(bkt *bolt.Bucket) (map[string]typeurl.Any, error) {
        extensions := make(map[string]typeurl.Any)

        ebkt := bkt.Bucket(bucketKeyExtensions)
        if ebkt == nil {
                return extensions, nil
        }

        decode := func(k, v []byte) error {
                ext := new(types.Any)
                if err := proto.Unmarshal(v, ext); err != nil {
                        return err
                }

                extensions[string(k)] = ext
                return nil
        }
        if err := ebkt.ForEach(decode); err != nil {
                return nil, err
        }

        return extensions, nil
}

// WriteAny writes a protobuf's Any type to the bucket under the key name.
//
// Nothing is written when protobuf.FromAny yields nil for the value.
// Marshaling and storage errors are returned wrapped.
func WriteAny(bkt *bolt.Bucket, name []byte, any typeurl.Any) error {
        converted := protobuf.FromAny(any)
        if converted == nil {
                return nil
        }

        encoded, err := proto.Marshal(converted)
        if err != nil {
                return fmt.Errorf("failed to marshal: %w", err)
        }

        err = bkt.Put(name, encoded)
        if err != nil {
                return fmt.Errorf("put failed: %w", err)
        }

        return nil
}

// ReadAny reads back protobuf's Any type from the bucket.
//
// It returns nil and a nil error when nothing is stored under name, and a
// wrapped error when the stored bytes cannot be unmarshaled.
func ReadAny(bkt *bolt.Bucket, name []byte) (*types.Any, error) {
        raw := bkt.Get(name)
        if raw == nil {
                return nil, nil
        }

        decoded := new(types.Any)
        if err := proto.Unmarshal(raw, decoded); err != nil {
                return nil, fmt.Errorf("failed to unmarshal any: %w", err)
        }

        return decoded, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package metadata stores all labels and object specific metadata by namespace.
// This package also contains the main garbage collection logic for cleaning up
// resources consistently and atomically. Resources used by backends will be
// tracked in the metadata store to be exposed to consumers of this package.
//
// The layout where a "/" delineates a bucket is described in the following
// section. Please try to follow this as closely as possible when adding
// functionality. We can bolster this with helpers and more structure if that
// becomes an issue.
//
// Generically, we try to do the following:
//
//        <version>/<namespace>/<object>/<key> -> <field>
//
// version: Currently, this is "v1". Additions can be made to v1 in a backwards
// compatible way. If the layout changes, a new version must be made, along
// with a migration.
//
// namespace: the namespace to which this object belongs.
//
// object: defines which object set is stored in the bucket. There are two
// special objects, "labels" and "indexes". The "labels" bucket stores the
// labels for the parent namespace. The "indexes" object is reserved for
// indexing objects, should that be required in the future.
//
// key: object-specific key identifying the storage bucket for the object's
// contents.
//
// Below is the current database schema. This should be updated each time
// the structure is changed in addition to adding a migration and incrementing
// the database version.
// Notes:
//
//   - `╘══*...*` refers to maps with arbitrary keys
//
//   - `version` is a key to a numeric value identifying the minor revisions
//     of schema version
//
//   - a namespace in a schema bucket cannot be named "version"
//
/*
 └──v1                                        - Schema version bucket
    ├──version : <varint>                     - Latest version, see migrations
    ╘══*namespace*
          ├──labels
          │  ╘══*key* : <string>                 - Label value
          ├──images
          │  ╘══*image name*
          │     ├──createdat : <binary time>     - Created at
          │     ├──updatedat : <binary time>     - Updated at
          │     ├──target
          │     │  ├──digest : <digest>          - Descriptor digest
          │     │  ├──mediatype : <string>       - Descriptor media type
          │     │  └──size : <varint>            - Descriptor size
          │     └──labels
          │        ╘══*key* : <string>           - Label value
          ├──containers
          │  ╘══*container id*
          │     ├──createdat : <binary time>     - Created at
          │     ├──updatedat : <binary time>     - Updated at
          │     ├──spec : <binary>               - Proto marshaled spec
          │     ├──image : <string>              - Image name
          │     ├──snapshotter : <string>        - Snapshotter name
          │     ├──snapshotKey : <string>        - Snapshot key
          │     ├──sandboxid : <string>          - Sandbox ID
          │     ├──runtime
          │     │  ├──name : <string>            - Runtime name
          │     │  └──options : <binary>         - Proto marshaled options
          │     ├──extensions
          │     │  ╘══*name* : <binary>          - Proto marshaled extension
          │     └──labels
          │        ╘══*key* : <string>           - Label value
          ├──snapshots
          │  ╘══*snapshotter*
          │     ╘══*snapshot key*
          │        ├──name : <string>            - Snapshot name in backend
          │        ├──createdat : <binary time>  - Created at
          │        ├──updatedat : <binary time>  - Updated at
          │        ├──parent : <string>          - Parent snapshot name
          │        ├──children
          │        │  ╘══*snapshot key* : <nil>  - Child snapshot reference
          │        └──labels
          │           ╘══*key* : <string>        - Label value
          ├──content
          │  ├──blob
          │  │  ╘══*blob digest*
          │  │     ├──createdat : <binary time>  - Created at
          │  │     ├──updatedat : <binary time>  - Updated at
          │  │     ├──size : <varint>            - Blob size
          │  │     └──labels
          │  │        ╘══*key* : <string>        - Label value
          │  └──ingests
          │     ╘══*ingest reference*
          │        ├──ref : <string>             - Ingest reference in backend
          │        ├──expireat : <binary time>   - Time to expire ingest
          │        └──expected : <digest>        - Expected commit digest
          ├──sandboxes
          │  ╘══*sandbox id*
          │     ├──createdat : <binary time>     - Created at
          │     ├──updatedat : <binary time>     - Updated at
          │     ├──spec : <binary>               - Proto marshaled spec
          │     ├──sandboxer : <string>          - Sandboxer name
          │     ├──runtime
          │     │  ├──name : <string>            - Runtime name
          │     │  └──options : <binary>         - Proto marshaled options
          │     ├──extensions
          │     │  ╘══*name* : <binary>          - Proto marshaled extension
          │     └──labels
          │        ╘══*key* : <string>           - Label value
          └──leases
             ╘══*lease id*
                ├──createdat : <binary time>     - Created at
                ├──labels
                │  ╘══*key* : <string>           - Label value
                ├──snapshots
                │  ╘══*snapshotter*
                │     ╘══*snapshot key* : <nil>  - Snapshot reference
                ├──content
                │  ╘══*blob digest* : <nil>      - Content blob reference
                └──ingests
                   ╘══*ingest reference* : <nil> - Content ingest reference
*/
package metadata

import (
        digest "github.com/opencontainers/go-digest"
        bolt "go.etcd.io/bbolt"
)

// Byte-slice keys used to address buckets and values in the bolt metadata
// database. They are combined by the bucket helper functions in this file
// into paths of the form <version>/<namespace>/<object>/...
var (
        // Keys naming the schema version bucket and the object buckets that
        // live beneath a namespace.
        bucketKeyVersion          = []byte(schemaVersion)
        bucketKeyDBVersion        = []byte("version")    // stores the version of the schema
        bucketKeyObjectLabels     = []byte("labels")     // stores the labels for a namespace.
        bucketKeyObjectImages     = []byte("images")     // stores image objects
        bucketKeyObjectContainers = []byte("containers") // stores container objects
        bucketKeyObjectSnapshots  = []byte("snapshots")  // stores snapshot references
        bucketKeyObjectContent    = []byte("content")    // stores content references
        bucketKeyObjectBlob       = []byte("blob")       // stores content links
        bucketKeyObjectIngests    = []byte("ingests")    // stores ingest objects
        bucketKeyObjectLeases     = []byte("leases")     // stores leases
        bucketKeyObjectSandboxes  = []byte("sandboxes")  // stores sandboxes

        // Keys naming individual fields (or nested buckets) inside an object's
        // bucket.
        bucketKeyDigest      = []byte("digest")
        bucketKeyMediaType   = []byte("mediatype")
        bucketKeySize        = []byte("size")
        bucketKeyImage       = []byte("image")
        bucketKeyRuntime     = []byte("runtime")
        bucketKeyName        = []byte("name")
        bucketKeyParent      = []byte("parent")
        bucketKeyChildren    = []byte("children")
        bucketKeyOptions     = []byte("options")
        bucketKeySpec        = []byte("spec")
        bucketKeySnapshotKey = []byte("snapshotKey")
        bucketKeySnapshotter = []byte("snapshotter")
        bucketKeyTarget      = []byte("target")
        bucketKeyExtensions  = []byte("extensions")
        bucketKeyCreatedAt   = []byte("createdat")
        bucketKeyExpected    = []byte("expected")
        bucketKeyRef         = []byte("ref")
        bucketKeyExpireAt    = []byte("expireat")
        bucketKeySandboxID   = []byte("sandboxid")
        bucketKeySandboxer   = []byte("sandboxer")

        deprecatedBucketKeyObjectIngest = []byte("ingest") // stores ingest links, deprecated in v1.2
)

// getBucket descends from the root of tx through the buckets named by keys and
// returns the innermost one. It returns nil as soon as any bucket on the path
// does not exist. At least one key must be supplied.
func getBucket(tx *bolt.Tx, keys ...[]byte) *bolt.Bucket {
        current := tx.Bucket(keys[0])
        for i := 1; i < len(keys) && current != nil; i++ {
                current = current.Bucket(keys[i])
        }
        return current
}

// createBucketIfNotExists descends from the root of tx through the buckets
// named by keys, creating each one that is missing, and returns the innermost
// bucket. The first error encountered stops the descent and is returned with a
// nil bucket. At least one key must be supplied.
func createBucketIfNotExists(tx *bolt.Tx, keys ...[]byte) (*bolt.Bucket, error) {
        current, err := tx.CreateBucketIfNotExists(keys[0])
        for i := 1; err == nil && i < len(keys); i++ {
                current, err = current.CreateBucketIfNotExists(keys[i])
        }
        if err != nil {
                return nil, err
        }
        return current, nil
}

// namespaceLabelsBucketPath returns the bucket path
// <version>/<namespace>/labels for the given namespace.
func namespaceLabelsBucketPath(namespace string) [][]byte {
        nsKey := []byte(namespace)
        return [][]byte{bucketKeyVersion, nsKey, bucketKeyObjectLabels}
}

// withNamespacesLabelsBucket ensures the labels bucket for namespace exists and
// then invokes fn with it. It returns the bucket-creation error, if any,
// without calling fn; otherwise it returns whatever fn returns.
func withNamespacesLabelsBucket(tx *bolt.Tx, namespace string, fn func(bkt *bolt.Bucket) error) error {
        path := namespaceLabelsBucketPath(namespace)
        labelsBkt, err := createBucketIfNotExists(tx, path...)
        if err == nil {
                err = fn(labelsBkt)
        }
        return err
}

// getNamespaceLabelsBucket returns the labels bucket for namespace, or nil if
// it (or any parent bucket) does not exist.
func getNamespaceLabelsBucket(tx *bolt.Tx, namespace string) *bolt.Bucket {
        path := namespaceLabelsBucketPath(namespace)
        return getBucket(tx, path...)
}

// imagesBucketPath returns the bucket path <version>/<namespace>/images for
// the given namespace.
func imagesBucketPath(namespace string) [][]byte {
        nsKey := []byte(namespace)
        return [][]byte{bucketKeyVersion, nsKey, bucketKeyObjectImages}
}

// createImagesBucket returns the images bucket for namespace, creating it and
// any missing parent buckets.
func createImagesBucket(tx *bolt.Tx, namespace string) (*bolt.Bucket, error) {
        path := imagesBucketPath(namespace)
        return createBucketIfNotExists(tx, path...)
}

// getImagesBucket returns the images bucket for namespace, or nil if it (or
// any parent bucket) does not exist.
func getImagesBucket(tx *bolt.Tx, namespace string) *bolt.Bucket {
        path := imagesBucketPath(namespace)
        return getBucket(tx, path...)
}

// createContainersBucket returns the bucket <version>/<namespace>/containers,
// creating it and any missing parent buckets.
func createContainersBucket(tx *bolt.Tx, namespace string) (*bolt.Bucket, error) {
        nsKey := []byte(namespace)
        return createBucketIfNotExists(tx, bucketKeyVersion, nsKey, bucketKeyObjectContainers)
}

// getContainersBucket returns the bucket <version>/<namespace>/containers, or
// nil if it (or any parent bucket) does not exist.
func getContainersBucket(tx *bolt.Tx, namespace string) *bolt.Bucket {
        nsKey := []byte(namespace)
        return getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectContainers)
}

// getContainerBucket returns the bucket holding the container with the given
// id, <version>/<namespace>/containers/<id>, or nil if any bucket on that path
// does not exist.
func getContainerBucket(tx *bolt.Tx, namespace, id string) *bolt.Bucket {
        nsKey, idKey := []byte(namespace), []byte(id)
        return getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectContainers, idKey)
}

// createSnapshotterBucket returns the bucket
// <version>/<namespace>/snapshots/<snapshotter>, creating it and any missing
// parent buckets.
func createSnapshotterBucket(tx *bolt.Tx, namespace, snapshotter string) (*bolt.Bucket, error) {
        // createBucketIfNotExists already yields a nil bucket alongside any
        // error, so its results can be handed back unchanged.
        return createBucketIfNotExists(tx, bucketKeyVersion, []byte(namespace), bucketKeyObjectSnapshots, []byte(snapshotter))
}

// getSnapshottersBucket returns the bucket <version>/<namespace>/snapshots,
// which contains one child bucket per snapshotter, or nil if it does not exist.
func getSnapshottersBucket(tx *bolt.Tx, namespace string) *bolt.Bucket {
        nsKey := []byte(namespace)
        return getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectSnapshots)
}

// getSnapshotterBucket returns the bucket
// <version>/<namespace>/snapshots/<snapshotter>, or nil if any bucket on that
// path does not exist.
func getSnapshotterBucket(tx *bolt.Tx, namespace, snapshotter string) *bolt.Bucket {
        nsKey, snKey := []byte(namespace), []byte(snapshotter)
        return getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectSnapshots, snKey)
}

// createBlobBucket creates the bucket for the blob dgst beneath
// <version>/<namespace>/content/blob, creating the parent buckets as needed.
// The blob bucket itself is made with CreateBucket rather than
// CreateBucketIfNotExists, so whatever error bolt reports for that call is
// returned to the caller unchanged.
func createBlobBucket(tx *bolt.Tx, namespace string, dgst digest.Digest) (*bolt.Bucket, error) {
        blobs, err := createBucketIfNotExists(tx, bucketKeyVersion, []byte(namespace), bucketKeyObjectContent, bucketKeyObjectBlob)
        if err != nil {
                return nil, err
        }

        name := []byte(dgst.String())
        return blobs.CreateBucket(name)
}

// getBlobsBucket returns the bucket <version>/<namespace>/content/blob, which
// contains one child bucket per blob digest, or nil if it does not exist.
func getBlobsBucket(tx *bolt.Tx, namespace string) *bolt.Bucket {
        nsKey := []byte(namespace)
        return getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectContent, bucketKeyObjectBlob)
}

// getBlobBucket returns the bucket for the blob dgst,
// <version>/<namespace>/content/blob/<digest>, or nil if any bucket on that
// path does not exist.
func getBlobBucket(tx *bolt.Tx, namespace string, dgst digest.Digest) *bolt.Bucket {
        nsKey, blobKey := []byte(namespace), []byte(dgst.String())
        return getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectContent, bucketKeyObjectBlob, blobKey)
}

// getIngestsBucket returns the bucket <version>/<namespace>/content/ingests,
// which contains one child bucket per ingest reference, or nil if it does not
// exist.
func getIngestsBucket(tx *bolt.Tx, namespace string) *bolt.Bucket {
        nsKey := []byte(namespace)
        return getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectContent, bucketKeyObjectIngests)
}

// createIngestBucket returns the bucket
// <version>/<namespace>/content/ingests/<ref>, creating it and any missing
// parent buckets.
func createIngestBucket(tx *bolt.Tx, namespace, ref string) (*bolt.Bucket, error) {
        // createBucketIfNotExists already yields a nil bucket alongside any
        // error, so its results can be handed back unchanged.
        return createBucketIfNotExists(tx, bucketKeyVersion, []byte(namespace), bucketKeyObjectContent, bucketKeyObjectIngests, []byte(ref))
}

// getIngestBucket returns the bucket for the ingest ref,
// <version>/<namespace>/content/ingests/<ref>, or nil if any bucket on that
// path does not exist.
func getIngestBucket(tx *bolt.Tx, namespace, ref string) *bolt.Bucket {
        nsKey, refKey := []byte(namespace), []byte(ref)
        return getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectContent, bucketKeyObjectIngests, refKey)
}

// createSandboxBucket returns the bucket <version>/<namespace>/sandboxes,
// creating it and any missing parent buckets.
//
// The path is rooted at the schema version bucket, like every other object
// bucket in this file. Omitting it would place sandboxes in a root-level bucket
// named after the namespace, outside the versioned schema, and would collide
// with the schema bucket for a namespace whose name equals the schema version.
func createSandboxBucket(tx *bolt.Tx, namespace string) (*bolt.Bucket, error) {
        return createBucketIfNotExists(
                tx,
                bucketKeyVersion,
                []byte(namespace),
                bucketKeyObjectSandboxes,
        )
}

// getSandboxBucket returns the bucket <version>/<namespace>/sandboxes, or nil
// if it (or any parent bucket) does not exist. It must use the same path as
// createSandboxBucket so that reads find what writes stored.
func getSandboxBucket(tx *bolt.Tx, namespace string) *bolt.Bucket {
        return getBucket(
                tx,
                bucketKeyVersion,
                []byte(namespace),
                bucketKeyObjectSandboxes,
        )
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
        "context"
        "fmt"
        "strings"
        "sync/atomic"
        "time"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/metadata/boltutil"
        "github.com/containerd/containerd/v2/pkg/filters"
        "github.com/containerd/containerd/v2/pkg/identifiers"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/protobuf/proto"
        "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/errdefs"
        "github.com/containerd/typeurl/v2"
        bolt "go.etcd.io/bbolt"
)

// containerStore keeps container records in the metadata database. Its methods
// resolve the namespace from the supplied context and operate on that
// namespace's containers bucket.
type containerStore struct {
        // db is the metadata database passed to NewContainerStore.
        db *DB
}

// NewContainerStore returns a containers.Store whose records are kept in the
// given metadata DB.
func NewContainerStore(db *DB) containers.Store {
        store := new(containerStore)
        store.db = db
        return store
}

// Get loads the container with the given id from the namespace carried by ctx.
//
// It returns the error from namespaces.NamespaceRequired if no namespace can
// be resolved, an error wrapping errdefs.ErrNotFound if the container's bucket
// does not exist, and a wrapped read error if the stored record cannot be
// decoded. On any error the returned container is the zero value.
func (s *containerStore) Get(ctx context.Context, id string) (containers.Container, error) {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return containers.Container{}, err
        }

        found := containers.Container{ID: id}
        load := func(tx *bolt.Tx) error {
                cbkt := getContainerBucket(tx, namespace, id)
                if cbkt == nil {
                        return fmt.Errorf("container %q in namespace %q: %w", id, namespace, errdefs.ErrNotFound)
                }

                if rerr := readContainer(&found, cbkt); rerr != nil {
                        return fmt.Errorf("failed to read container %q: %w", id, rerr)
                }
                return nil
        }

        if err := view(ctx, s.db, load); err != nil {
                return containers.Container{}, err
        }
        return found, nil
}

// List returns the containers in the namespace carried by ctx that match the
// filter expressions in fs.
//
// A filter that fails to parse yields an error wrapping
// errdefs.ErrInvalidArgument. A namespace with no containers bucket yields a
// nil slice and no error. Keys in the containers bucket that are not buckets
// themselves are skipped.
func (s *containerStore) List(ctx context.Context, fs ...string) ([]containers.Container, error) {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }

        filter, err := filters.ParseAll(fs...)
        if err != nil {
                return nil, fmt.Errorf("%s: %w", err.Error(), errdefs.ErrInvalidArgument)
        }

        var matched []containers.Container
        collect := func(tx *bolt.Tx) error {
                all := getContainersBucket(tx, namespace)
                if all == nil {
                        // Nothing has been stored for this namespace yet.
                        return nil
                }

                return all.ForEach(func(id, _ []byte) error {
                        cbkt := all.Bucket(id)
                        if cbkt == nil {
                                return nil
                        }

                        c := containers.Container{ID: string(id)}
                        if err := readContainer(&c, cbkt); err != nil {
                                return fmt.Errorf("failed to read container %q: %w", string(id), err)
                        }

                        if !filter.Match(adaptContainer(c)) {
                                return nil
                        }
                        matched = append(matched, c)
                        return nil
                })
        }

        if err := view(ctx, s.db, collect); err != nil {
                return nil, err
        }
        return matched, nil
}

// Create validates container and stores it as a new record in the namespace
// carried by ctx, stamping CreatedAt and UpdatedAt with the current UTC time.
//
// It returns the stored container including those timestamps. If a bucket for
// container.ID already exists the error wraps errdefs.ErrAlreadyExists. On any
// error the returned container is the zero value.
func (s *containerStore) Create(ctx context.Context, container containers.Container) (containers.Container, error) {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return containers.Container{}, err
        }

        if err := validateContainer(&container); err != nil {
                return containers.Container{}, fmt.Errorf("create container failed validation: %w", err)
        }

        err = update(ctx, s.db, func(tx *bolt.Tx) error {
                parent, err := createContainersBucket(tx, namespace)
                if err != nil {
                        return err
                }

                cbkt, err := parent.CreateBucket([]byte(container.ID))
                switch {
                case err == bolt.ErrBucketExists:
                        return fmt.Errorf("container %q: %w", container.ID, errdefs.ErrAlreadyExists)
                case err != nil:
                        return err
                }

                now := time.Now().UTC()
                container.CreatedAt = now
                container.UpdatedAt = now
                if err := writeContainer(cbkt, &container); err != nil {
                        return fmt.Errorf("failed to write container %q: %w", container.ID, err)
                }
                return nil
        })
        if err != nil {
                return containers.Container{}, err
        }

        return container, nil
}

// Update modifies the stored container identified by container.ID in the
// namespace carried by ctx and returns the resulting record.
//
// With no fieldpaths, the labels, spec, extensions, image and snapshotkey
// fields are replaced from container, and the call is rejected with
// errdefs.ErrInvalidArgument if container.Snapshotter or
// container.Runtime.Name differ from the stored values. With fieldpaths, only
// the named fields are copied; besides the five names above, "labels.<key>"
// and "extensions.<key>" address a single map entry. Any other path yields
// errdefs.ErrInvalidArgument.
//
// The stored CreatedAt is preserved and UpdatedAt is set to the current UTC
// time. A missing container yields an error wrapping errdefs.ErrNotFound. On
// any error the returned container is the zero value.
func (s *containerStore) Update(ctx context.Context, container containers.Container, fieldpaths ...string) (containers.Container, error) {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return containers.Container{}, err
        }

        if len(container.ID) == 0 {
                return containers.Container{}, fmt.Errorf("must specify a container id: %w", errdefs.ErrInvalidArgument)
        }

        var updated containers.Container
        apply := func(tx *bolt.Tx) error {
                parent := getContainersBucket(tx, namespace)
                if parent == nil {
                        return fmt.Errorf("cannot update container %q in namespace %q: %w", container.ID, namespace, errdefs.ErrNotFound)
                }

                cbkt := parent.Bucket([]byte(container.ID))
                if cbkt == nil {
                        return fmt.Errorf("container %q: %w", container.ID, errdefs.ErrNotFound)
                }

                // Start from the stored record and overlay the requested fields.
                if err := readContainer(&updated, cbkt); err != nil {
                        return fmt.Errorf("failed to read container %q: %w", container.ID, err)
                }
                originalCreatedAt := updated.CreatedAt
                updated.ID = container.ID

                if len(fieldpaths) == 0 {
                        // A full replace is limited to this set of fields.
                        fieldpaths = []string{"labels", "spec", "extensions", "image", "snapshotkey"}

                        // Immutable fields are rejected explicitly on a full
                        // replace so that they can be made mutable later
                        // without changing what callers observe today.
                        switch {
                        case updated.Snapshotter != container.Snapshotter:
                                return fmt.Errorf("container.Snapshotter field is immutable: %w", errdefs.ErrInvalidArgument)
                        case updated.Runtime.Name != container.Runtime.Name:
                                return fmt.Errorf("container.Runtime.Name field is immutable: %w", errdefs.ErrInvalidArgument)
                        }
                }

                // Apply the field mask. Changes here must follow the field mask
                // rules in field_mask.proto.
                for _, path := range fieldpaths {
                        switch {
                        case strings.HasPrefix(path, "labels."):
                                if updated.Labels == nil {
                                        updated.Labels = map[string]string{}
                                }
                                key := strings.TrimPrefix(path, "labels.")
                                updated.Labels[key] = container.Labels[key]
                        case strings.HasPrefix(path, "extensions."):
                                if updated.Extensions == nil {
                                        updated.Extensions = map[string]typeurl.Any{}
                                }
                                key := strings.TrimPrefix(path, "extensions.")
                                updated.Extensions[key] = container.Extensions[key]
                        case path == "labels":
                                updated.Labels = container.Labels
                        case path == "spec":
                                updated.Spec = container.Spec
                        case path == "extensions":
                                updated.Extensions = container.Extensions
                        case path == "image":
                                updated.Image = container.Image
                        case path == "snapshotkey":
                                updated.SnapshotKey = container.SnapshotKey
                        default:
                                return fmt.Errorf("cannot update %q field on %q: %w", path, container.ID, errdefs.ErrInvalidArgument)
                        }
                }

                if err := validateContainer(&updated); err != nil {
                        return fmt.Errorf("update failed validation: %w", err)
                }

                updated.CreatedAt = originalCreatedAt
                updated.UpdatedAt = time.Now().UTC()
                if err := writeContainer(cbkt, &updated); err != nil {
                        return fmt.Errorf("failed to write container %q: %w", container.ID, err)
                }
                return nil
        }

        if err := update(ctx, s.db, apply); err != nil {
                return containers.Container{}, err
        }
        return updated, nil
}

// Delete removes the container with the given id from the namespace carried by
// ctx. If the namespace has no containers bucket, or the container's bucket is
// missing, the error wraps errdefs.ErrNotFound.
func (s *containerStore) Delete(ctx context.Context, id string) error {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        remove := func(tx *bolt.Tx) error {
                parent := getContainersBucket(tx, namespace)
                if parent == nil {
                        return fmt.Errorf("cannot delete container %q in namespace %q: %w", id, namespace, errdefs.ErrNotFound)
                }

                switch err := parent.DeleteBucket([]byte(id)); {
                case err == nil:
                        // Deleted; fall through to the bookkeeping below.
                case err == bolt.ErrBucketNotFound:
                        return fmt.Errorf("container %v: %w", id, errdefs.ErrNotFound)
                default:
                        return err
                }

                // Bump the database's dirty counter after a successful delete.
                // NOTE(review): presumably this tells garbage collection that
                // resources may have become unreferenced; confirm against DB.
                atomic.AddUint32(&s.db.dirty, 1)
                return nil
        }

        return update(ctx, s.db, remove)
}

// validateContainer checks that container is acceptable for storage and
// returns the first problem found, or nil. It requires:
//   - an ID accepted by identifiers.Validate,
//   - no empty extension key,
//   - labels accepted by labels.Validate,
//   - a non-empty Runtime.Name,
//   - a non-nil Spec,
//   - a Snapshotter whenever SnapshotKey is set.
//
// The image field is not validated.
func validateContainer(container *containers.Container) error {
        if err := identifiers.Validate(container.ID); err != nil {
                return fmt.Errorf("container.ID: %w", err)
        }

        for name := range container.Extensions {
                if len(name) == 0 {
                        return fmt.Errorf("container.Extension keys must not be zero-length: %w", errdefs.ErrInvalidArgument)
                }
        }

        for key, value := range container.Labels {
                if err := labels.Validate(key, value); err != nil {
                        return fmt.Errorf("containers.Labels: %w", err)
                }
        }

        switch {
        case container.Runtime.Name == "":
                return fmt.Errorf("container.Runtime.Name must be set: %w", errdefs.ErrInvalidArgument)
        case container.Spec == nil:
                return fmt.Errorf("container.Spec must be set: %w", errdefs.ErrInvalidArgument)
        case container.SnapshotKey != "" && container.Snapshotter == "":
                return fmt.Errorf("container.Snapshotter must be set if container.SnapshotKey is set: %w", errdefs.ErrInvalidArgument)
        }

        return nil
}

// readContainer fills container from the contents of bkt, the bucket holding a
// single container record. Labels and timestamps are read first through
// boltutil; the remaining fields are populated by walking the bucket's keys.
// Keys that are not recognised are ignored, and a "runtime" key that is not a
// nested bucket is skipped.
func readContainer(container *containers.Container, bkt *bolt.Bucket) error {
        lbls, err := boltutil.ReadLabels(bkt)
        if err != nil {
                return err
        }
        container.Labels = lbls

        if err := boltutil.ReadTimestamps(bkt, &container.CreatedAt, &container.UpdatedAt); err != nil {
                return err
        }

        visit := func(key, value []byte) error {
                switch string(key) {
                case string(bucketKeyImage):
                        container.Image = string(value)
                case string(bucketKeySnapshotter):
                        container.Snapshotter = string(value)
                case string(bucketKeySnapshotKey):
                        container.SnapshotKey = string(value)
                case string(bucketKeySandboxID):
                        container.SandboxID = string(value)
                case string(bucketKeySpec):
                        spec := new(types.Any)
                        if err := proto.Unmarshal(value, spec); err != nil {
                                return err
                        }
                        container.Spec = spec
                case string(bucketKeyExtensions):
                        exts, err := boltutil.ReadExtensions(bkt)
                        if err != nil {
                                return err
                        }
                        container.Extensions = exts
                case string(bucketKeyRuntime):
                        runtimeBkt := bkt.Bucket(bucketKeyRuntime)
                        if runtimeBkt == nil {
                                return nil // skip runtime. should be an error?
                        }

                        if name := runtimeBkt.Get(bucketKeyName); name != nil {
                                container.Runtime.Name = string(name)
                        }

                        opts, err := boltutil.ReadAny(runtimeBkt, bucketKeyOptions)
                        if err != nil {
                                return err
                        }
                        container.Runtime.Options = opts
                }
                return nil
        }

        return bkt.ForEach(visit)
}

// writeContainer stores every persisted field of container into bkt, the
// bucket holding a single container record. The nested runtime bucket is
// dropped and recreated on each call. The first failing write aborts the
// operation and its error is returned.
func writeContainer(bkt *bolt.Bucket, container *containers.Container) error {
        if err := boltutil.WriteTimestamps(bkt, container.CreatedAt, container.UpdatedAt); err != nil {
                return err
        }

        if err := boltutil.WriteAny(bkt, bucketKeySpec, container.Spec); err != nil {
                return err
        }

        scalars := []struct{ key, value []byte }{
                {bucketKeyImage, []byte(container.Image)},
                {bucketKeySnapshotter, []byte(container.Snapshotter)},
                {bucketKeySnapshotKey, []byte(container.SnapshotKey)},
        }
        for _, field := range scalars {
                if err := bkt.Put(field.key, field.value); err != nil {
                        return err
                }
        }

        // Replace any existing runtime bucket with a fresh one.
        if bkt.Bucket(bucketKeyRuntime) != nil {
                if err := bkt.DeleteBucket(bucketKeyRuntime); err != nil {
                        return err
                }
        }

        runtimeBkt, err := bkt.CreateBucket(bucketKeyRuntime)
        if err != nil {
                return err
        }

        if err := runtimeBkt.Put(bucketKeyName, []byte(container.Runtime.Name)); err != nil {
                return err
        }

        if err := boltutil.WriteExtensions(bkt, container.Extensions); err != nil {
                return err
        }

        if err := boltutil.WriteAny(runtimeBkt, bucketKeyOptions, container.Runtime.Options); err != nil {
                return err
        }

        if err := bkt.Put(bucketKeySandboxID, []byte(container.SandboxID)); err != nil {
                return err
        }

        return boltutil.WriteLabels(bkt, container.Labels)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
        "context"
        "encoding/binary"
        "fmt"
        "strings"
        "sync"
        "sync/atomic"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/metadata/boltutil"
        "github.com/containerd/containerd/v2/pkg/filters"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        bolt "go.etcd.io/bbolt"
)

// contentStore wraps a content.Store with per-namespace metadata kept in the
// metadata database. Methods not overridden on contentStore are served by the
// embedded store.
type contentStore struct {
        // Store is the underlying content store supplied to newContentStore.
        content.Store
        // db is the metadata database holding the namespaced content records.
        db *DB
        // shared records the sharing policy supplied to newContentStore.
        shared bool
        // l is read-locked by Update and Delete for the duration of the call.
        // NOTE(review): the write-lock holder is not visible in this part of
        // the file — presumably garbage collection; confirm.
        l sync.RWMutex
}

// newContentStore wraps cs in a namespaced content store backed by db.
//
// shared selects how content is shared between namespaces. Under either
// policy, committed content ends up in shared storage in the backend. With the
// "shared" policy (shared == true), writes try to resolve the "expected"
// digest against the backend, so a namespace can import content that another
// namespace already holds without pulling it again. With the "isolated" policy
// (shared == false), a client must prove it has the content by supplying the
// whole blob before the content is added to another namespace.
//
// Only these two policies exist, so a bool is used to represent them
// internally.
func newContentStore(db *DB, shared bool, cs content.Store) *contentStore {
        store := &contentStore{db: db, shared: shared}
        store.Store = cs
        return store
}

// Info returns the metadata recorded for the blob dgst in the namespace
// carried by ctx. If the namespace has no bucket for that digest the error
// wraps errdefs.ErrNotFound. On any error the returned info is the zero value.
func (cs *contentStore) Info(ctx context.Context, dgst digest.Digest) (content.Info, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return content.Info{}, err
        }

        var info content.Info
        load := func(tx *bolt.Tx) error {
                blob := getBlobBucket(tx, ns, dgst)
                if blob == nil {
                        return fmt.Errorf("content digest %v: %w", dgst, errdefs.ErrNotFound)
                }

                info.Digest = dgst
                return readInfo(&info, blob)
        }

        if err := view(ctx, cs.db, load); err != nil {
                return content.Info{}, err
        }
        return info, nil
}

// Update changes the mutable metadata of the blob info.Digest in the namespace
// carried by ctx and returns the stored result.
//
// Only labels can be changed. With no fieldpaths, the label set is replaced by
// info.Labels. With fieldpaths, "labels" replaces the whole set and
// "labels.<key>" copies a single entry from info.Labels; any other path yields
// an error wrapping errdefs.ErrInvalidArgument. UpdatedAt is set to the current
// UTC time. A missing blob yields an error wrapping errdefs.ErrNotFound. The
// store's lock is read-held for the duration of the call.
func (cs *contentStore) Update(ctx context.Context, info content.Info, fieldpaths ...string) (content.Info, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return content.Info{}, err
        }

        cs.l.RLock()
        defer cs.l.RUnlock()

        updated := content.Info{Digest: info.Digest}
        apply := func(tx *bolt.Tx) error {
                blob := getBlobBucket(tx, ns, info.Digest)
                if blob == nil {
                        return fmt.Errorf("content digest %v: %w", info.Digest, errdefs.ErrNotFound)
                }

                if err := readInfo(&updated, blob); err != nil {
                        return fmt.Errorf("info %q: %w", info.Digest, err)
                }

                if len(fieldpaths) == 0 {
                        // No mask: replace every mutable field.
                        updated.Labels = info.Labels
                }
                for _, path := range fieldpaths {
                        switch {
                        case strings.HasPrefix(path, "labels."):
                                if updated.Labels == nil {
                                        updated.Labels = map[string]string{}
                                }
                                key := strings.TrimPrefix(path, "labels.")
                                updated.Labels[key] = info.Labels[key]
                        case path == "labels":
                                updated.Labels = info.Labels
                        default:
                                return fmt.Errorf("cannot update %q field on content info %q: %w", path, info.Digest, errdefs.ErrInvalidArgument)
                        }
                }

                if err := validateInfo(&updated); err != nil {
                        return err
                }

                updated.UpdatedAt = time.Now().UTC()
                return writeInfo(&updated, blob)
        }

        if err := update(ctx, cs.db, apply); err != nil {
                return content.Info{}, err
        }
        return updated, nil
}

// Walk calls fn for every blob recorded in the namespace carried by ctx whose
// info matches the filters in fs.
//
// Matching infos are gathered while the read transaction is open, and fn is
// only invoked once the view call has returned. The first error returned by fn
// stops the walk and is returned to the caller. A namespace with no blobs
// bucket yields no calls and no error.
func (cs *contentStore) Walk(ctx context.Context, fn content.WalkFunc, fs ...string) error {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        filter, err := filters.ParseAll(fs...)
        if err != nil {
                return err
        }

        // TODO: Batch results to keep from reading all info into memory
        var infos []content.Info
        if err := view(ctx, cs.db, func(tx *bolt.Tx) error {
                bkt := getBlobsBucket(tx, ns)
                if bkt == nil {
                        return nil
                }

                return bkt.ForEach(func(k, v []byte) error {
                        dgst, err := digest.Parse(string(k))
                        if err != nil {
                                // Not a digest, skip
                                return nil
                        }
                        bbkt := bkt.Bucket(k)
                        if bbkt == nil {
                                // Key is not a nested bucket, skip
                                return nil
                        }
                        info := content.Info{
                                Digest: dgst,
                        }
                        // Reuse the bucket fetched above rather than looking
                        // it up a second time for every blob.
                        if err := readInfo(&info, bbkt); err != nil {
                                return err
                        }
                        if filter.Match(content.AdaptInfo(info)) {
                                infos = append(infos, info)
                        }
                        return nil
                })
        }); err != nil {
                return err
        }

        for _, info := range infos {
                if err := fn(info); err != nil {
                        return err
                }
        }

        return nil
}

// Delete removes the blob bucket for dgst from the namespace carried by ctx
// and calls removeContentLease for it, all within one update transaction.
//
// It returns an ErrNotFound error when the namespace has no bucket for dgst.
// On success the database is flagged dirty so that a later garbage collection
// is triggered.
func (cs *contentStore) Delete(ctx context.Context, dgst digest.Digest) error {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        cs.l.RLock()
        defer cs.l.RUnlock()

        remove := func(tx *bolt.Tx) error {
                if getBlobBucket(tx, ns, dgst) == nil {
                        return fmt.Errorf("content digest %v: %w", dgst, errdefs.ErrNotFound)
                }

                blobs := getBlobsBucket(tx, ns)
                if err := blobs.DeleteBucket([]byte(dgst.String())); err != nil {
                        return err
                }

                if err := removeContentLease(ctx, tx, dgst); err != nil {
                        return err
                }

                // Mark content store as dirty for triggering garbage collection
                atomic.AddUint32(&cs.db.dirty, 1)
                cs.db.dirtyCS = true

                return nil
        }

        return update(ctx, cs.db, remove)
}

// ListStatuses returns the status of every active ingest in the namespace
// carried by ctx that matches the filters in fs.
//
// The namespace's ingest references are first mapped to their backend
// references inside a read transaction. Each backend reference is then queried
// on the backing store; ingests the backend reports as not found are skipped.
// The Ref of every returned status is the namespaced reference, not the
// backend one.
func (cs *contentStore) ListStatuses(ctx context.Context, fs ...string) ([]content.Status, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }

        filter, err := filters.ParseAll(fs...)
        if err != nil {
                return nil, err
        }

        // backendRefs maps the namespaced ingest ref to the backend ref.
        backendRefs := make(map[string]string)
        collect := func(tx *bolt.Tx) error {
                ingests := getIngestsBucket(tx, ns)
                if ingests == nil {
                        return nil
                }

                return ingests.ForEach(func(name, value []byte) error {
                        if value != nil {
                                // Plain key/value pair, not an ingest bucket.
                                return nil
                        }
                        // TODO(dmcgowan): match name and potentially labels here
                        backendRefs[string(name)] = string(ingests.Bucket(name).Get(bucketKeyRef))
                        return nil
                })
        }
        if err := view(ctx, cs.db, collect); err != nil {
                return nil, err
        }

        statuses := make([]content.Status, 0, len(backendRefs))
        for ref, backendRef := range backendRefs {
                status, err := cs.Store.Status(ctx, backendRef)
                switch {
                case err == nil:
                        // fall out of the switch and report this status
                case errdefs.IsNotFound(err):
                        continue
                default:
                        return nil, err
                }
                status.Ref = ref

                if filter.Match(adaptContentStatus(status)) {
                        statuses = append(statuses, status)
                }
        }

        return statuses, nil
}

// getRef returns the backend reference stored under bucketKeyRef in the ingest
// bucket for ref in namespace ns. It returns the empty string when the ingest
// bucket is missing or holds no reference.
func getRef(tx *bolt.Tx, ns, ref string) string {
        if bkt := getIngestBucket(tx, ns, ref); bkt != nil {
                // Converting a nil or empty value yields "", which is the
                // "not present" result.
                return string(bkt.Get(bucketKeyRef))
        }
        return ""
}

// Status reports the state of the ingest ref in the namespace carried by ctx.
//
// The namespaced ref is resolved to its backend reference inside a read
// transaction; an ErrNotFound error is returned when no backend reference is
// recorded. The status is then fetched from the backing store and its Ref is
// set back to the namespaced ref before it is returned.
func (cs *contentStore) Status(ctx context.Context, ref string) (content.Status, error) {
        var none content.Status

        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return none, err
        }

        var backendRef string
        resolve := func(tx *bolt.Tx) error {
                if backendRef = getRef(tx, ns, ref); backendRef == "" {
                        return fmt.Errorf("reference %v: %w", ref, errdefs.ErrNotFound)
                }
                return nil
        }
        if err := view(ctx, cs.db, resolve); err != nil {
                return none, err
        }

        status, err := cs.Store.Status(ctx, backendRef)
        if err != nil {
                return none, err
        }
        status.Ref = ref
        return status, nil
}

// Abort cancels the ingest ref in the namespace carried by ctx.
//
// Inside one update transaction the ingest bucket is deleted and
// removeIngestLease is called for ref. When the ingest has no expected digest
// recorded (it is not shared content), the ingest is also aborted on the
// backing store. An ErrNotFound error is returned when the namespace has no
// ingests bucket, no bucket for ref, or no backend reference stored for it.
func (cs *contentStore) Abort(ctx context.Context, ref string) error {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        cs.l.RLock()
        defer cs.l.RUnlock()

        notFound := func() error {
                return fmt.Errorf("reference %v: %w", ref, errdefs.ErrNotFound)
        }

        return update(ctx, cs.db, func(tx *bolt.Tx) error {
                ingests := getIngestsBucket(tx, ns)
                if ingests == nil {
                        return notFound()
                }

                key := []byte(ref)
                ingest := ingests.Bucket(key)
                if ingest == nil {
                        return notFound()
                }

                backendRef := string(ingest.Get(bucketKeyRef))
                if backendRef == "" {
                        return notFound()
                }

                // Read everything needed from the ingest bucket before it is
                // deleted below.
                shared := string(ingest.Get(bucketKeyExpected)) != ""

                if err := ingests.DeleteBucket(key); err != nil {
                        return err
                }

                if err := removeIngestLease(ctx, tx, ref); err != nil {
                        return err
                }

                if shared {
                        return nil
                }

                // if not shared content, delete active ingest on backend
                return cs.Store.Abort(ctx, backendRef)
        })
}

// Writer opens an ingest for the ref given in opts within the namespace
// carried by ctx and returns a writer bound to that namespace.
//
// An ErrInvalidArgument error is returned when no ref is supplied. When the
// expected digest is already recorded in the namespace, the content is added
// to the lease and an ErrAlreadyExists error is returned once the transaction
// has completed. When content is shared (cs.shared is set or isSharedContent
// reports true) and the backing store already holds the digest with a
// compatible size, only the expected digest is recorded on the ingest and no
// backend writer is opened; the returned writer then has a nil w. Otherwise a
// backend writer is opened under the backend reference stored in, or newly
// generated for, the ingest bucket.
func (cs *contentStore) Writer(ctx context.Context, opts ...content.WriterOpt) (content.Writer, error) {
        var wOpts content.WriterOpts
        for _, opt := range opts {
                if err := opt(&wOpts); err != nil {
                        return nil, err
                }
        }
        // TODO(AkihiroSuda): we could create a random string or one calculated based on the context
        // https://github.com/containerd/containerd/issues/2129#issuecomment-380255019
        if wOpts.Ref == "" {
                return nil, fmt.Errorf("ref must not be empty: %w", errdefs.ErrInvalidArgument)
        }
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }

        cs.l.RLock()
        defer cs.l.RUnlock()

        var (
                w      content.Writer
                exists bool
                bref   string
        )
        if err := update(ctx, cs.db, func(tx *bolt.Tx) error {
                var shared bool
                if wOpts.Desc.Digest != "" {
                        cbkt := getBlobBucket(tx, ns, wOpts.Desc.Digest)
                        if cbkt != nil {
                                // Add content to lease to prevent other reference removals
                                // from affecting this object during a provided lease
                                if err := addContentLease(ctx, tx, wOpts.Desc.Digest); err != nil {
                                        return fmt.Errorf("unable to lease content: %w", err)
                                }
                                // Return error outside of transaction to ensure
                                // commit succeeds with the lease.
                                exists = true
                                return nil
                        }

                        if cs.shared || isSharedContent(tx, wOpts.Desc.Digest) {
                                if st, err := cs.Store.Info(ctx, wOpts.Desc.Digest); err == nil {
                                        // Ensure the expected size is the same, it is likely
                                        // an error if the size is mismatched but the caller
                                        // must resolve this on commit
                                        if wOpts.Desc.Size == 0 || wOpts.Desc.Size == st.Size {
                                                shared = true
                                                wOpts.Desc.Size = st.Size
                                        }
                                }
                        }
                }

                bkt, err := createIngestBucket(tx, ns, wOpts.Ref)
                if err != nil {
                        return err
                }

                leased, err := addIngestLease(ctx, tx, wOpts.Ref)
                if err != nil {
                        return err
                }

                // Reuse the backend reference already stored on the ingest
                // bucket, or generate and store a new one.
                brefb := bkt.Get(bucketKeyRef)
                if brefb == nil {
                        sid, err := bkt.NextSequence()
                        if err != nil {
                                return err
                        }

                        bref = createKey(sid, ns, wOpts.Ref)
                        if err := bkt.Put(bucketKeyRef, []byte(bref)); err != nil {
                                return err
                        }
                } else {
                        bref = string(brefb)
                }
                if !leased {
                        // Add timestamp to allow aborting once stale
                        // When lease is set the ingest should be aborted
                        // after lease it belonged to is deleted.
                        // Expiration can be configurable in the future to
                        // give more control to the daemon, however leases
                        // already give users more control of expiration.
                        expireAt := time.Now().UTC().Add(24 * time.Hour)
                        if err := writeExpireAt(expireAt, bkt); err != nil {
                                return err
                        }
                }

                if shared {
                        // Record only the expected digest; no backend writer is
                        // opened while the content is shared.
                        if err := bkt.Put(bucketKeyExpected, []byte(wOpts.Desc.Digest)); err != nil {
                                return err
                        }
                } else {
                        // Do not use the passed in expected value here since it was
                        // already checked against the user metadata. The content must
                        // be committed in the namespace before it will be seen as
                        // available in the current namespace.
                        desc := wOpts.Desc
                        desc.Digest = ""
                        w, err = cs.Store.Writer(ctx, content.WithRef(bref), content.WithDescriptor(desc))
                }
                return err
        }); err != nil {
                return nil, err
        }
        if exists {
                return nil, fmt.Errorf("content %v: %w", wOpts.Desc.Digest, errdefs.ErrAlreadyExists)
        }

        return &namespacedWriter{
                ctx:       ctx,
                ref:       wOpts.Ref,
                namespace: ns,
                db:        cs.db,
                provider:  cs.Store,
                l:         &cs.l,
                w:         w,
                bref:      bref,
                started:   time.Now(),
                desc:      wOpts.Desc,
        }, nil
}

// namespacedWriter is the writer returned by contentStore.Writer. It binds an
// ingest reference in one namespace to an optional writer on the backing
// store.
type namespacedWriter struct {
        // ctx is the context the writer was created with; Write and Truncate
        // pass it to createAndCopy.
        ctx       context.Context
        // ref is the ingest reference as seen inside the namespace.
        ref       string
        // namespace is the namespace the ingest belongs to.
        namespace string
        // db is used to run the metadata transaction in Commit.
        db        transactor
        // provider is the backing store, used to open the backend writer and
        // to read existing content when copying.
        provider  interface {
                content.Provider
                content.Ingester
        }
        // l points at the content store's lock; Commit holds it for reading.
        l *sync.RWMutex

        // w is the writer on the backing store. It is nil until
        // createAndCopy opens one.
        w content.Writer

        // bref is the reference used for the ingest on the backing store.
        bref    string
        // started is the time the writer was created; it is reported by
        // Status while w is nil.
        started time.Time
        // desc is the descriptor the writer was opened with.
        desc    ocispec.Descriptor
}

// Close closes the backend writer if one has been opened. It is a no-op
// returning nil when there is none.
func (nw *namespacedWriter) Close() error {
        if nw.w == nil {
                return nil
        }
        return nw.w.Close()
}

// Write writes p to the backend writer.
//
// When no backend writer exists yet, one is first opened via createAndCopy
// using nw.desc, unless p is empty, in which case nothing is done and (0, nil)
// is returned.
func (nw *namespacedWriter) Write(p []byte) (int, error) {
        if nw.w != nil {
                return nw.w.Write(p)
        }

        // No writer yet: an empty write must not trigger the copy.
        if len(p) == 0 {
                return 0, nil
        }

        // First copy and unshare before performing the write.
        if err := nw.createAndCopy(nw.ctx, nw.desc); err != nil {
                return 0, err
        }
        return nw.w.Write(p)
}

// Digest returns the backend writer's current digest, or the digest from
// nw.desc when no backend writer has been opened.
func (nw *namespacedWriter) Digest() digest.Digest {
        if nw.w == nil {
                return nw.desc.Digest
        }
        return nw.w.Digest()
}

// Truncate truncates the backend writer to size.
//
// When no backend writer exists yet, one is opened via createAndCopy with a
// copy of nw.desc whose Size is set to size and whose Digest is cleared.
func (nw *namespacedWriter) Truncate(size int64) error {
        if nw.w == nil {
                truncated := nw.desc
                truncated.Size, truncated.Digest = size, ""
                return nw.createAndCopy(nw.ctx, truncated)
        }
        return nw.w.Truncate(size)
}

// createAndCopy opens a writer on the backing store under nw.bref and, when
// desc.Size is positive, fills it with the first desc.Size bytes read from the
// content described by nw.desc.
//
// The writer is opened with desc stripped of its digest. On success it is
// stored in nw.w; on any failure after it was opened it is closed and nw.w is
// left unchanged.
func (nw *namespacedWriter) createAndCopy(ctx context.Context, desc ocispec.Descriptor) error {
        target := desc
        target.Digest = ""

        w, err := nw.provider.Writer(ctx, content.WithRef(nw.bref), content.WithDescriptor(target))
        if err != nil {
                return err
        }

        if desc.Size <= 0 {
                // Nothing to copy.
                nw.w = w
                return nil
        }

        // The source is always the writer's own descriptor, not desc.
        src, err := nw.provider.ReaderAt(ctx, nw.desc)
        if err != nil {
                w.Close()
                return err
        }
        defer src.Close()

        if err := content.CopyReaderAt(w, src, desc.Size); err != nil {
                w.Close()
                return err
        }

        nw.w = w
        return nil
}

// Commit finalizes the ingest in the writer's namespace.
//
// In-flight writes are synced first. Then, in one update transaction, the
// content is committed via nw.commit, the ingest bucket is deleted,
// removeIngestLease is called for the ref and addContentLease is called for
// the resulting digest. An already-exists error from nw.commit does not abort
// the transaction; it is returned to the caller after the transaction has
// completed successfully.
func (nw *namespacedWriter) Commit(ctx context.Context, size int64, expected digest.Digest, opts ...content.Opt) error {
        ctx = namespaces.WithNamespace(ctx, nw.namespace)

        nw.l.RLock()
        defer nw.l.RUnlock()

        // Pre-sync the in-flight writes to disk. This keeps the later
        // fp.Sync() call[1] from taking too long (10s+) while the metadata
        // database lock is held by the `update` transaction below.
        //
        // REF:
        // [1]: https://github.com/containerd/containerd/blob/c4c3c6ea568ce0cfbcf754863abadeea37d77c8f/plugins/content/local/writer.go#L95
        if err := nw.Sync(); err != nil {
                nw.Close()
                return fmt.Errorf("failed to perform sync: %w", err)
        }

        // existsErr holds an already-exists error so it can be surfaced after
        // the transaction commits.
        var existsErr error

        finish := func(tx *bolt.Tx) error {
                dgst, err := nw.commit(ctx, tx, size, expected, opts...)
                if err != nil {
                        if !errdefs.IsAlreadyExists(err) {
                                return err
                        }
                        existsErr = err
                }

                if ingests := getIngestsBucket(tx, nw.namespace); ingests != nil {
                        err := ingests.DeleteBucket([]byte(nw.ref))
                        if err != nil && err != bolt.ErrBucketNotFound {
                                return err
                        }
                }

                if err := removeIngestLease(ctx, tx, nw.ref); err != nil {
                        return err
                }
                return addContentLease(ctx, tx, dgst)
        }

        if err := update(ctx, nw.db, finish); err != nil {
                return err
        }
        return existsErr
}

// Sync calls Sync on the backend writer when it implements content.Syncer.
// It returns nil when there is no backend writer or it is not a Syncer.
func (nw *namespacedWriter) Sync() error {
        syncer, ok := nw.w.(content.Syncer)
        if !ok {
                return nil
        }
        return syncer.Sync()
}

// commit finalizes the ingest inside the metadata transaction tx and returns
// the digest of the committed content.
//
// opts are applied to a fresh content.Info whose labels are then validated.
// Without a backend writer (nw.w == nil) the size and digest in nw.desc are
// checked against size and expected. Otherwise the backend writer's offset is
// checked against size and the backend writer is committed, tolerating an
// already-exists error from the backend. A blob bucket is then created for the
// digest and the timestamps, labels and size are written to it. When the blob
// bucket already exists, the digest is returned together with an
// ErrAlreadyExists error.
//
// A size of 0 or an empty expected digest disables the respective check.
func (nw *namespacedWriter) commit(ctx context.Context, tx *bolt.Tx, size int64, expected digest.Digest, opts ...content.Opt) (digest.Digest, error) {
        var base content.Info
        for _, opt := range opts {
                if err := opt(&base); err != nil {
                        if nw.w != nil {
                                nw.w.Close()
                        }
                        return "", err
                }
        }
        if err := validateInfo(&base); err != nil {
                if nw.w != nil {
                        nw.w.Close()
                }
                return "", err
        }

        var actual digest.Digest
        if nw.w == nil {
                // No backend writer was opened: validate against the
                // descriptor the writer was created with.
                if size != 0 && size != nw.desc.Size {
                        return "", fmt.Errorf("%q failed size validation: %v != %v: %w", nw.ref, nw.desc.Size, size, errdefs.ErrFailedPrecondition)
                }
                if expected != "" && expected != nw.desc.Digest {
                        return "", fmt.Errorf("%q unexpected digest: %w", nw.ref, errdefs.ErrFailedPrecondition)
                }
                size = nw.desc.Size
                actual = nw.desc.Digest
        } else {
                status, err := nw.w.Status()
                if err != nil {
                        nw.w.Close()
                        return "", err
                }
                if size != 0 && size != status.Offset {
                        nw.w.Close()
                        return "", fmt.Errorf("%q failed size validation: %v != %v: %w", nw.ref, status.Offset, size, errdefs.ErrFailedPrecondition)
                }
                size = status.Offset

                // An already-exists error from the backend is not fatal here.
                if err := nw.w.Commit(ctx, size, expected); err != nil && !errdefs.IsAlreadyExists(err) {
                        return "", err
                }
                actual = nw.w.Digest()
        }

        bkt, err := createBlobBucket(tx, nw.namespace, actual)
        if err != nil {
                if err == bolt.ErrBucketExists {
                        // Return the digest alongside the error so the caller
                        // can still use it.
                        return actual, fmt.Errorf("content %v: %w", actual, errdefs.ErrAlreadyExists)
                }
                return "", err
        }

        commitTime := time.Now().UTC()

        sizeEncoded, err := encodeInt(size)
        if err != nil {
                return "", err
        }

        // Created and updated timestamps both start at the commit time.
        if err := boltutil.WriteTimestamps(bkt, commitTime, commitTime); err != nil {
                return "", err
        }
        if err := boltutil.WriteLabels(bkt, base.Labels); err != nil {
                return "", err
        }
        return actual, bkt.Put(bucketKeySize, sizeEncoded)
}

// Status reports the state of the ingest.
//
// With a backend writer its status is returned, with Ref replaced by the
// namespaced ref on success. Without one, a status is built from nw.desc and
// the writer's start time: Offset and Total are both the descriptor size and
// Expected is the descriptor digest.
func (nw *namespacedWriter) Status() (st content.Status, err error) {
        if nw.w == nil {
                st = content.Status{
                        Ref:       nw.ref,
                        Offset:    nw.desc.Size,
                        Total:     nw.desc.Size,
                        StartedAt: nw.started,
                        UpdatedAt: nw.started,
                        Expected:  nw.desc.Digest,
                }
                return st, nil
        }

        st, err = nw.w.Status()
        if err != nil {
                return st, err
        }
        st.Ref = nw.ref
        return st, nil
}

// ReaderAt returns a reader for desc from the backing store, after checkAccess
// has confirmed that desc.Digest is recorded in the namespace carried by ctx.
func (cs *contentStore) ReaderAt(ctx context.Context, desc ocispec.Descriptor) (content.ReaderAt, error) {
        err := cs.checkAccess(ctx, desc.Digest)
        if err != nil {
                return nil, err
        }
        return cs.Store.ReaderAt(ctx, desc)
}

// checkAccess returns nil when the namespace carried by ctx has a blob bucket
// for dgst, and an ErrNotFound error otherwise. The lookup runs in a read
// transaction.
func (cs *contentStore) checkAccess(ctx context.Context, dgst digest.Digest) error {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        exists := func(tx *bolt.Tx) error {
                if getBlobBucket(tx, ns, dgst) != nil {
                        return nil
                }
                return fmt.Errorf("content digest %v: %w", dgst, errdefs.ErrNotFound)
        }
        return view(ctx, cs.db, exists)
}

// isSharedContent reports whether dgst is recorded in any namespace whose
// labels bucket has labels.LabelSharedNamespace set to "true".
//
// It returns false when the version bucket is absent from tx.
func isSharedContent(tx *bolt.Tx, dgst digest.Digest) bool {
        root := tx.Bucket(bucketKeyVersion)
        if root == nil {
                return false
        }

        sharedKey := []byte(labels.LabelSharedNamespace)

        // iterate through each namespace
        cur := root.Cursor()
        for name, _ := cur.First(); name != nil; name, _ = cur.Next() {
                ns := string(name)

                nsLabels := getNamespaceLabelsBucket(tx, ns)
                if nsLabels == nil {
                        continue
                }
                // A missing label converts to "" and so never equals "true".
                if string(nsLabels.Get(sharedKey)) != "true" {
                        continue
                }
                if getBlobBucket(tx, ns, dgst) != nil {
                        return true
                }
        }
        return false
}

// validateInfo checks every label on info with labels.Validate and returns
// the first failure, wrapped with an "info.Labels" prefix. It returns nil when
// all labels pass or there are none.
func validateInfo(info *content.Info) error {
        for key, value := range info.Labels {
                err := labels.Validate(key, value)
                if err == nil {
                        continue
                }
                return fmt.Errorf("info.Labels: %w", err)
        }
        return nil
}

// readInfo fills info with the timestamps, labels and size stored in bkt.
//
// info.Size is only set when a non-empty size value is present. The byte count
// returned by binary.Varint is discarded, so a malformed size value is not
// reported as an error.
func readInfo(info *content.Info, bkt *bolt.Bucket) error {
        err := boltutil.ReadTimestamps(bkt, &info.CreatedAt, &info.UpdatedAt)
        if err != nil {
                return err
        }

        stored, err := boltutil.ReadLabels(bkt)
        if err != nil {
                return err
        }
        info.Labels = stored

        raw := bkt.Get(bucketKeySize)
        if len(raw) == 0 {
                return nil
        }
        info.Size, _ = binary.Varint(raw)
        return nil
}

// writeInfo stores the timestamps, labels and size of info in bkt, in that
// order, stopping at the first error. A label write failure is wrapped with
// the info's digest.
func writeInfo(info *content.Info, bkt *bolt.Bucket) error {
        err := boltutil.WriteTimestamps(bkt, info.CreatedAt, info.UpdatedAt)
        if err != nil {
                return err
        }

        err = boltutil.WriteLabels(bkt, info.Labels)
        if err != nil {
                return fmt.Errorf("writing labels for info %v: %w", info.Digest, err)
        }

        // Write size
        encoded, err := encodeInt(info.Size)
        if err != nil {
                return err
        }
        return bkt.Put(bucketKeySize, encoded)
}

// readExpireAt decodes the expiration time stored under bucketKeyExpireAt in
// bkt. It returns (nil, nil) when no value is stored, and an error when the
// stored value cannot be unmarshalled.
func readExpireAt(bkt *bolt.Bucket) (*time.Time, error) {
        raw := bkt.Get(bucketKeyExpireAt)
        if raw == nil {
                return nil, nil
        }

        var expire time.Time
        if err := expire.UnmarshalBinary(raw); err != nil {
                return nil, err
        }
        return &expire, nil
}

// writeExpireAt stores expire, in its binary encoding, under
// bucketKeyExpireAt in bkt.
func writeExpireAt(expire time.Time, bkt *bolt.Bucket) error {
        encoded, err := expire.MarshalBinary()
        if err == nil {
                err = bkt.Put(bucketKeyExpireAt, encoded)
        }
        return err
}

// garbageCollect removes all contents that are no longer used.
//
// It holds cs.l exclusively for its whole duration. First, in a read
// transaction, it records every blob digest and every ingest backend reference
// found in any namespace bucket. It then walks the backing store and deletes
// each blob whose digest was not recorded, and finally aborts every backend
// ingest whose reference was not recorded.
//
// On success d is the time elapsed since the lock was acquired; on failure d
// is left at zero and err is set.
func (cs *contentStore) garbageCollect(ctx context.Context) (d time.Duration, err error) {
        cs.l.Lock()
        t1 := time.Now()
        defer func() {
                // Only report a duration for a collection that succeeded.
                if err == nil {
                        d = time.Since(t1)
                }
                cs.l.Unlock()
        }()

        contentSeen := map[string]struct{}{}
        ingestSeen := map[string]struct{}{}
        if err := cs.db.View(func(tx *bolt.Tx) error {
                v1bkt := tx.Bucket(bucketKeyVersion)
                if v1bkt == nil {
                        return nil
                }

                // iterate through each namespace
                v1c := v1bkt.Cursor()

                for k, v := v1c.First(); k != nil; k, v = v1c.Next() {
                        // A non-nil value means a plain key, not a namespace bucket.
                        if v != nil {
                                continue
                        }

                        cbkt := v1bkt.Bucket(k).Bucket(bucketKeyObjectContent)
                        if cbkt == nil {
                                continue
                        }
                        bbkt := cbkt.Bucket(bucketKeyObjectBlob)
                        if bbkt != nil {
                                if err := bbkt.ForEach(func(ck, cv []byte) error {
                                        if cv == nil {
                                                contentSeen[string(ck)] = struct{}{}
                                        }
                                        return nil
                                }); err != nil {
                                        return err
                                }
                        }

                        ibkt := cbkt.Bucket(bucketKeyObjectIngests)
                        if ibkt != nil {
                                if err := ibkt.ForEach(func(ref, v []byte) error {
                                        if v == nil {
                                                bkt := ibkt.Bucket(ref)
                                                // expected here may be from a different namespace
                                                // so must be explicitly retained from the ingest
                                                // in case it was removed from the other namespace
                                                expected := bkt.Get(bucketKeyExpected)
                                                if len(expected) > 0 {
                                                        contentSeen[string(expected)] = struct{}{}
                                                }
                                                bref := bkt.Get(bucketKeyRef)
                                                if len(bref) > 0 {
                                                        ingestSeen[string(bref)] = struct{}{}
                                                }
                                        }
                                        return nil
                                }); err != nil {
                                        return err
                                }
                        }
                }

                return nil
        }); err != nil {
                return 0, err
        }

        // Delete every backend blob that no namespace or ingest refers to.
        err = cs.Store.Walk(ctx, func(info content.Info) error {
                if _, ok := contentSeen[info.Digest.String()]; !ok {
                        if err := cs.Store.Delete(ctx, info.Digest); err != nil {
                                return err
                        }
                        log.G(ctx).WithField("digest", info.Digest).Debug("removed content")
                }
                return nil
        })
        if err != nil {
                return
        }

        // If the content store has implemented a more efficient walk function
        // then use that else fallback to reading all statuses which may
        // cause reading of unneeded metadata.
        type statusWalker interface {
                WalkStatusRefs(context.Context, func(string) error) error
        }
        if w, ok := cs.Store.(statusWalker); ok {
                err = w.WalkStatusRefs(ctx, func(ref string) error {
                        if _, ok := ingestSeen[ref]; !ok {
                                if err := cs.Store.Abort(ctx, ref); err != nil {
                                        return err
                                }
                                log.G(ctx).WithField("ref", ref).Debug("cleanup aborting ingest")
                        }
                        return nil
                })
        } else {
                var statuses []content.Status
                statuses, err = cs.Store.ListStatuses(ctx)
                if err != nil {
                        return 0, err
                }
                for _, status := range statuses {
                        if _, ok := ingestSeen[status.Ref]; !ok {
                                if err = cs.Store.Abort(ctx, status.Ref); err != nil {
                                        return
                                }
                                log.G(ctx).WithField("ref", status.Ref).Debug("cleanup aborting ingest")
                        }
                }
        }
        return
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
        "context"
        "encoding/binary"
        "errors"
        "fmt"
        "strings"
        "sync"
        "sync/atomic"
        "time"

        eventstypes "github.com/containerd/containerd/api/events"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/internal/cleanup"
        "github.com/containerd/containerd/v2/pkg/gc"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/log"
        bolt "go.etcd.io/bbolt"
)

const (
        // schemaVersion represents the schema version of
        // the database. This schema version represents the
        // structure of the data in the database. The schema
        // can evolve at any time but any backwards
        // incompatible changes or structural changes require
        // bumping the schema version.
        schemaVersion = "v1"

        // dbVersion represents updates to the schema
        // version which are additions and compatible with
        // prior version of the same schema.
        dbVersion = 3
)

// DBOpt configures how we set up the DB. Each option mutates the dbOptions
// it is given; NewDB applies the options in the order they are passed.
type DBOpt func(*dbOptions)

// WithPolicyIsolated isolates contents between namespaces by turning off the
// shared option. It has the DBOpt signature itself, so it is passed to NewDB
// directly rather than being called.
func WithPolicyIsolated(o *dbOptions) {
        o.shared = false
}

// WithEventsPublisher adds an events publisher to the
// metadata db to directly publish events. It returns a DBOpt that stores p
// in the options' publisher field.
func WithEventsPublisher(p events.Publisher) DBOpt {
        return func(o *dbOptions) {
                o.publisher = p
        }
}

// dbOptions holds the settings that DBOpt functions configure.
type dbOptions struct {
        // shared is passed to the content store on creation. NewDB defaults
        // it to true and WithPolicyIsolated sets it to false.
        shared    bool
        // publisher is the events publisher set by WithEventsPublisher.
        publisher events.Publisher
}

// DB represents a metadata database backed by a bolt
// database. The database is fully namespaced and stores
// image, container, namespace, snapshot, and content data
// while proxying data shared across namespaces to backend
// datastores for content and snapshots.
type DB struct {
        // db is the underlying bolt database.
        db *bolt.DB
        // ss holds the snapshotter wrappers created by NewDB, keyed by
        // snapshotter name.
        ss map[string]*snapshotter
        // cs is the content store wrapper created by NewDB.
        cs *contentStore

        // wlock is used to protect access to the data structures during garbage
        // collection. While the wlock is held no writable transactions can be
        // opened, preventing changes from occurring between the mark and
        // sweep phases without preventing read transactions.
        wlock sync.RWMutex

        // dirty flag indicates that references have been removed which require
        // a garbage collection to ensure the database is clean. This tracks
        // the number of dirty operations. This should be updated and read
        // atomically if outside of wlock.Lock.
        dirty uint32

        // dirtySS and dirtyCS flags keep track of datastores which have had
        // deletions since the last garbage collection. These datastores will
        // be garbage collected during the next garbage collection. These
        // should only be updated inside of a write transaction or wlock.Lock.
        dirtySS map[string]struct{}
        dirtyCS bool

        // mutationCallbacks are called after each mutation with the flag
        // set indicating whether any dirty flags are set
        mutationCallbacks []func(bool)

        // collectible resources
        collectors map[gc.ResourceType]Collector

        // dbopts holds the options applied by NewDB.
        dbopts dbOptions
}

// NewDB builds a metadata database on top of the given bolt database,
// wrapping the provided content store and every provided snapshotter with
// their metadata-backed counterparts. The shared option defaults to true
// and may be overridden through opts.
func NewDB(db *bolt.DB, cs content.Store, ss map[string]snapshots.Snapshotter, opts ...DBOpt) *DB {
        mdb := &DB{
                db:      db,
                ss:      make(map[string]*snapshotter, len(ss)),
                dirtySS: make(map[string]struct{}),
        }

        // Start from the defaults, then let the caller's options override.
        mdb.dbopts.shared = true
        for i := range opts {
                opts[i](&mdb.dbopts)
        }

        // Wrap the backend data stores.
        mdb.cs = newContentStore(mdb, mdb.dbopts.shared, cs)
        for snName, backend := range ss {
                mdb.ss[snName] = newSnapshotter(mdb, snName, backend)
        }

        return mdb
}

// Init ensures the database is at the correct version
// and performs any needed migrations.
//
// All work happens inside a single bolt write transaction: the newest
// schema bucket present in the database is located, every migration newer
// than the stored version is applied in order, and the current dbVersion
// is written. When the database is already up to date the transaction is
// rolled back instead of committed. Any migration or write failure is
// returned and leaves the database unchanged.
func (m *DB) Init(ctx context.Context) error {
        // errSkip is used when no migration or version needs to be written
        // to the database and the transaction can be immediately rolled
        // back rather than performing a much slower and unnecessary commit.
        var errSkip = errors.New("skip update")

        err := m.db.Update(func(tx *bolt.Tx) error {
                var (
                        // current schema and version
                        schema  = "v0"
                        version = 0
                )

                // i represents the index of the first migration
                // which must be run to get the database up to date.
                // The migration's version will be checked in reverse
                // order, decrementing i for each migration which
                // represents a version newer than the current
                // database version
                i := len(migrations)

                for ; i > 0; i-- {
                        migration := migrations[i-1]

                        bkt := tx.Bucket([]byte(migration.schema))
                        if bkt == nil {
                                // Hasn't encountered another schema, go to next migration
                                if schema == "v0" {
                                        continue
                                }
                                break
                        }
                        if schema == "v0" {
                                schema = migration.schema
                                vb := bkt.Get(bucketKeyDBVersion)
                                if vb != nil {
                                        // A value which fails to decode yields 0, which
                                        // is treated like a database without a version.
                                        v, _ := binary.Varint(vb)
                                        version = int(v)
                                }
                        }

                        if version >= migration.version {
                                break
                        }
                }

                // Previous version of database found
                if schema != "v0" {
                        updates := migrations[i:]

                        // No migration updates, return immediately
                        if len(updates) == 0 {
                                return errSkip
                        }

                        // The loop variable is deliberately not named m so that
                        // it does not shadow the receiver.
                        for _, mig := range updates {
                                t0 := time.Now()
                                if err := mig.migrate(tx); err != nil {
                                        return fmt.Errorf("failed to migrate to %s.%d: %w", mig.schema, mig.version, err)
                                }
                                log.G(ctx).WithField("d", time.Since(t0)).Debugf("finished database migration to %s.%d", mig.schema, mig.version)
                        }
                }

                bkt, err := tx.CreateBucketIfNotExists(bucketKeyVersion)
                if err != nil {
                        return err
                }

                versionEncoded, err := encodeInt(dbVersion)
                if err != nil {
                        return err
                }

                return bkt.Put(bucketKeyDBVersion, versionEncoded)
        })
        // errSkip only signals that the rollback was intentional; errors.Is
        // keeps the check correct even if the error gets wrapped on its way
        // out of the transaction.
        if errors.Is(err, errSkip) {
                return nil
        }
        return err
}

// ContentStore returns the namespaced content store which proxies to the
// backend content store, or nil when the DB has no content store.
func (m *DB) ContentStore() content.Store {
        // Only hand out the store when it is set; otherwise return an
        // untyped nil rather than the nil *contentStore.
        if m.cs != nil {
                return m.cs
        }
        return nil
}

// Snapshotter looks up the metadata snapshotter registered under name,
// which proxies to the backend snapshotter. It returns nil when no
// snapshotter with that name exists.
func (m *DB) Snapshotter(name string) snapshots.Snapshotter {
        if wrapped, found := m.ss[name]; found {
                return wrapped
        }
        return nil
}

// Snapshotters returns every available snapshotter keyed by name. The
// returned map is a fresh copy, so callers may modify it freely.
func (m *DB) Snapshotters() map[string]snapshots.Snapshotter {
        out := make(map[string]snapshots.Snapshotter, len(m.ss))
        for name := range m.ss {
                out[name] = m.ss[name]
        }
        return out
}

// View runs a readonly transaction on the metadata store.
//
// The call is forwarded straight to the bolt database without taking
// wlock, and the error returned by the bolt database is passed back
// unchanged.
func (m *DB) View(fn func(*bolt.Tx) error) error {
        return m.db.View(fn)
}

// Update runs a writable transaction on the metadata store.
//
// The read side of wlock is held for the whole call, including while the
// registered mutation callbacks run. The callbacks are only invoked when
// the transaction succeeded and receive whether the dirty counter is
// currently non-zero.
func (m *DB) Update(fn func(*bolt.Tx) error) error {
        m.wlock.RLock()
        defer m.wlock.RUnlock()

        if err := m.db.Update(fn); err != nil {
                return err
        }

        hasDirty := atomic.LoadUint32(&m.dirty) > 0
        for _, callback := range m.mutationCallbacks {
                callback(hasDirty)
        }
        return nil
}

// Publisher returns the configured event publisher, or nil when either no
// publisher is configured or ctx carries a bolt transaction.
func (m *DB) Publisher(ctx context.Context) events.Publisher {
        // Do not publish events within a transaction
        if _, inTx := ctx.Value(transactionKey{}).(*bolt.Tx); inTx {
                return nil
        }
        // An unset publisher is already nil, so it can be returned as is.
        return m.dbopts.publisher
}

// RegisterMutationCallback registers a function to be called after a
// metadata mutation has been performed.
//
// The callback receives a flag indicating whether a deletion has occurred
// since the last garbage collection.
func (m *DB) RegisterMutationCallback(fn func(bool)) {
        m.wlock.Lock()
        defer m.wlock.Unlock()

        m.mutationCallbacks = append(m.mutationCallbacks, fn)
}

// RegisterCollectibleResource registers a resource type which can be
// referenced by metadata resources and garbage collected.
// Collectible Resources are useful for ephemeral resources which need to
// be tracked but go away after reboot or process restart.
//
// A few limitations to consider:
//   - Collectible Resources cannot reference other resources.
//   - A failure to complete collection will not fail the garbage collection,
//     however, the resources can be collected in a later run.
//   - Collectible Resources must track whether the resource is active and/or
//     lease membership.
//
// It panics when t is one of the built-in metadata resource types, when t
// is not below gc.ResourceMax, or when t has already been registered.
func (m *DB) RegisterCollectibleResource(t gc.ResourceType, c Collector) {
        // Validate the type range before taking the lock.
        switch {
        case t < resourceEnd:
                panic("cannot re-register metadata resource")
        case t >= gc.ResourceMax:
                panic("resource type greater than max")
        }

        m.wlock.Lock()
        defer m.wlock.Unlock()

        // The collectors map is created lazily on first registration.
        if m.collectors == nil {
                m.collectors = make(map[gc.ResourceType]Collector)
        }

        if _, registered := m.collectors[t]; registered {
                panic("cannot register collectible type twice")
        }
        m.collectors[t] = c
}

// namespacedEvent pairs an event with the namespace it belongs to. It is
// used to queue events during garbage collection so they can be published
// once the removal transaction has been committed.
type namespacedEvent struct {
        // namespace is applied to the context the event is published with.
        namespace string
        // event is the event payload; publishEvents picks the topic from
        // its concrete type.
        event     interface{}
}

// publishEvents publishes each queued event through the configured
// publisher using a background context carrying the event's namespace.
// It does nothing when no publisher is configured. Events of an unhandled
// type are logged and skipped; publish failures are logged and do not
// stop the remaining events from being published.
func (m *DB) publishEvents(events []namespacedEvent) {
        publisher := m.dbopts.publisher
        if publisher == nil {
                return
        }

        base := context.Background()
        for _, queued := range events {
                nsCtx := namespaces.WithNamespace(base, queued.namespace)

                // The topic is derived from the concrete event type.
                var topic string
                switch queued.event.(type) {
                case *eventstypes.ImageDelete:
                        topic = "/images/delete"
                case *eventstypes.SnapshotRemove:
                        topic = "/snapshot/remove"
                default:
                        log.G(nsCtx).WithField("event", queued.event).Debug("unhandled event type from garbage collection removal")
                        continue
                }

                err := publisher.Publish(nsCtx, topic, queued.event)
                if err != nil {
                        log.G(nsCtx).WithError(err).WithField("topic", topic).Debug("publish event failed")
                }
        }
}

// GCStats holds the duration for the different phases of the garbage collector
type GCStats struct {
        // MetaD is the time GarbageCollect spent between acquiring the
        // write lock and being ready to release it, covering the mark and
        // sweep of the metadata.
        MetaD     time.Duration
        // ContentD is the duration of the content store cleanup; it is
        // left at zero when no content cleanup was scheduled.
        ContentD  time.Duration
        // SnapshotD holds the cleanup duration per snapshotter name; it is
        // nil when no snapshotter cleanup was scheduled.
        SnapshotD map[string]time.Duration
}

// Elapsed returns the duration which elapsed during a collection.
//
// Only MetaD is reported; the content and snapshot cleanup durations are
// not included in the returned value.
func (s GCStats) Elapsed() time.Duration {
        return s.MetaD
}

// GarbageCollect removes resources (snapshots, contents, ...) that are no longer used.
//
// The collection runs in these steps:
//  1. The write lock is taken so no writable transaction can be opened
//     through Update, and collection contexts are started for the
//     registered collectors.
//  2. All reachable resources are marked (getMarked).
//  3. Inside one bolt write transaction every scanned resource which is not
//     marked is removed, recording which backend datastores became dirty
//     and queueing the events produced by the removals.
//  4. Events are published and the dirty snapshotters and content store are
//     cleaned up in background goroutines; the write lock is released
//     before waiting for them.
//
// On a mark or sweep failure the collection contexts are cancelled and the
// error is returned with nil stats. On success the returned stats are a
// GCStats value.
func (m *DB) GarbageCollect(ctx context.Context) (gc.Stats, error) {
        m.wlock.Lock()
        t1 := time.Now()
        c := startGCContext(ctx, m.collectors)

        marked, err := m.getMarked(ctx, c) // Pass in gc context
        if err != nil {
                m.wlock.Unlock()
                c.cancel(ctx)
                return nil, err
        }

        events := []namespacedEvent{}
        if err := m.db.Update(func(tx *bolt.Tx) error {
                ctx, cancel := context.WithCancel(ctx)
                defer cancel()

                // rm removes a single scanned node unless it was marked as
                // reachable.
                rm := func(ctx context.Context, n gc.Node) error {
                        if _, ok := marked[n]; ok {
                                return nil
                        }

                        // Record which backend datastore needs a cleanup pass
                        // once the metadata has been removed.
                        if n.Type == ResourceSnapshot {
                                // The part of the key before the first '/' names
                                // the snapshotter.
                                if idx := strings.IndexRune(n.Key, '/'); idx > 0 {
                                        m.dirtySS[n.Key[:idx]] = struct{}{}
                                }
                        } else if n.Type == ResourceContent || n.Type == ResourceIngest {
                                m.dirtyCS = true
                        }

                        // queue event to publish after successful commit
                        event, err := c.remove(ctx, tx, n)
                        if event != nil && err == nil {
                                events = append(events,
                                        namespacedEvent{
                                                namespace: n.Namespace,
                                                event:     event,
                                        })
                        }
                        return err
                }

                if err := c.scanAll(ctx, tx, rm); err != nil { // From gc context
                        return fmt.Errorf("failed to scan and remove: %w", err)
                }

                return nil
        }); err != nil {
                m.wlock.Unlock()
                c.cancel(ctx)
                return nil, err
        }

        var stats GCStats
        var wg sync.WaitGroup

        // Flush events asynchronously after commit
        wg.Add(1)
        go func() {
                m.publishEvents(events)
                wg.Done()
        }()

        // reset dirty, no need for atomic inside of wlock.Lock
        m.dirty = 0

        if len(m.dirtySS) > 0 {
                // sl guards stats.SnapshotD, which is written by one goroutine
                // per dirty snapshotter.
                var sl sync.Mutex
                stats.SnapshotD = map[string]time.Duration{}
                wg.Add(len(m.dirtySS))
                for snapshotterName := range m.dirtySS {
                        log.G(ctx).WithField("snapshotter", snapshotterName).Debug("schedule snapshotter cleanup")
                        go func(snapshotterName string) {
                                st1 := time.Now()
                                m.cleanupSnapshotter(ctx, snapshotterName)

                                sl.Lock()
                                stats.SnapshotD[snapshotterName] = time.Since(st1)
                                sl.Unlock()

                                wg.Done()
                        }(snapshotterName)
                }
                // Start the next collection cycle with a clean set.
                m.dirtySS = map[string]struct{}{}
        }

        if m.dirtyCS {
                wg.Add(1)
                log.G(ctx).Debug("schedule content cleanup")
                go func() {
                        ct1 := time.Now()
                        m.cleanupContent(ctx)
                        stats.ContentD = time.Since(ct1)
                        wg.Done()
                }()
                m.dirtyCS = false
        }

        stats.MetaD = time.Since(t1)
        m.wlock.Unlock()

        c.finish(ctx)

        // Wait for event publishing and the backend cleanups so the stats
        // are complete before they are returned.
        wg.Wait()

        // err is the result of getMarked, which is known to be nil here.
        return stats, err
}

// getMarked returns all resources that are used.
//
// Inside a single read transaction the root resources are collected from
// the gc context and the set of nodes reachable from those roots is
// computed with gc.Tricolor. The returned map contains every reachable
// node. An error from scanning roots, resolving references or the
// transaction itself is returned with a nil map.
func (m *DB) getMarked(ctx context.Context, c *gcContext) (map[gc.Node]struct{}, error) {
        var marked map[gc.Node]struct{}
        if err := m.db.View(func(tx *bolt.Tx) error {
                ctx, cancel := context.WithCancel(ctx)
                defer cancel()

                var (
                        nodes []gc.Node
                        wg    sync.WaitGroup
                        roots = make(chan gc.Node)
                )
                // Gather the roots sent by scanRoots until the channel is closed.
                wg.Add(1)
                go func() {
                        defer wg.Done()
                        for n := range roots {
                                nodes = append(nodes, n)
                        }
                }()

                // Call roots. The channel is closed and the gathering goroutine
                // awaited on the error path as well, otherwise that goroutine
                // would block on the channel forever and leak.
                scanErr := c.scanRoots(ctx, tx, roots) // From gc context
                close(roots)
                wg.Wait()
                if scanErr != nil {
                        return scanErr
                }

                // refs lists the nodes directly referenced by n.
                refs := func(n gc.Node) ([]gc.Node, error) {
                        var sn []gc.Node
                        if err := c.references(ctx, tx, n, func(nn gc.Node) { // From gc context
                                sn = append(sn, nn)
                        }); err != nil {
                                return nil, err
                        }
                        return sn, nil
                }

                reachable, err := gc.Tricolor(nodes, refs)
                if err != nil {
                        return err
                }
                marked = reachable
                return nil
        }); err != nil {
                return nil, err
        }
        return marked, nil
}

// cleanupSnapshotter garbage collects the named snapshotter and logs the
// outcome. It returns the duration and error reported by the snapshotter's
// garbage collection, or a zero duration and nil error when no snapshotter
// is registered under name. The work runs on a context derived through
// cleanup.Background.
func (m *DB) cleanupSnapshotter(ctx context.Context, name string) (time.Duration, error) {
        ctx = cleanup.Background(ctx)

        target, found := m.ss[name]
        if !found {
                return 0, nil
        }

        elapsed, err := target.garbageCollect(ctx)
        entry := log.G(ctx).WithField("snapshotter", name)
        if err != nil {
                entry.WithError(err).Warn("snapshot garbage collection failed")
                return elapsed, err
        }

        entry.WithField("d", elapsed).Tracef("snapshot garbage collected")
        return elapsed, nil
}

// cleanupContent garbage collects the content store and logs the outcome.
// It returns the duration and error reported by the content store's
// garbage collection, or a zero duration and nil error when the DB has no
// content store. The work runs on a context derived through
// cleanup.Background.
func (m *DB) cleanupContent(ctx context.Context) (time.Duration, error) {
        ctx = cleanup.Background(ctx)

        store := m.cs
        if store == nil {
                return 0, nil
        }

        elapsed, err := store.garbageCollect(ctx)
        if err != nil {
                log.G(ctx).WithError(err).Warn("content garbage collection failed")
                return elapsed, err
        }

        log.G(ctx).WithField("d", elapsed).Tracef("content garbage collected")
        return elapsed, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
        "bytes"
        "context"
        "fmt"
        "sort"
        "strings"
        "time"

        eventstypes "github.com/containerd/containerd/api/events"
        "github.com/containerd/containerd/v2/pkg/gc"
        "github.com/containerd/log"
        bolt "go.etcd.io/bbolt"
)

// Resource types known to the metadata garbage collector. The values are
// assigned sequentially through iota, so the declaration order is
// significant.
const (
        // ResourceUnknown specifies an unknown resource
        ResourceUnknown gc.ResourceType = iota
        // ResourceContent specifies a content resource
        ResourceContent
        // ResourceSnapshot specifies a snapshot resource
        ResourceSnapshot
        // ResourceContainer specifies a container resource
        ResourceContainer
        // ResourceTask specifies a task resource
        ResourceTask
        // ResourceImage specifies an image
        ResourceImage
        // ResourceLease specifies a lease
        ResourceLease
        // ResourceIngest specifies a content ingest
        ResourceIngest
        // resourceEnd is the end of specified resource types.
        // RegisterCollectibleResource panics for any type below this value,
        // so only types at or above it can be registered as collectible.
        resourceEnd
        // ResourceStream specifies a stream. It is declared after
        // resourceEnd and therefore passes the lower bound check of
        // RegisterCollectibleResource.
        ResourceStream
)

// Flat variants of the content, snapshot and image resource types, formed
// by setting bit 0x20 on the base type. scanRoots emits these types for
// resources held by a lease which carries the labelGCFlat label.
const (
        resourceContentFlat  = ResourceContent | 0x20
        resourceSnapshotFlat = ResourceSnapshot | 0x20
        resourceImageFlat    = ResourceImage | 0x20
)

// Label keys and key prefixes interpreted by the garbage collector.
var (
        // labelGCRoot is the "containerd.io/gc.root" label key.
        // NOTE(review): presumably marks an object as a garbage collection
        // root; its use is outside this part of the file — confirm.
        labelGCRoot       = []byte("containerd.io/gc.root")
        // labelGCRef is the common prefix of reference labels. The label
        // key for a registered Collector is this prefix followed by the
        // collector's ReferenceLabel.
        labelGCRef        = []byte("containerd.io/gc.ref.")
        // labelGCSnapRef is the key prefix of snapshot reference labels.
        // The key continues with the snapshotter name, optionally followed
        // by '/' and a reference name; the label value is the snapshot key.
        labelGCSnapRef    = []byte("containerd.io/gc.ref.snapshot.")
        // labelGCContentRef is the key of content reference labels. The
        // key may be followed by a reference name separated by '.' or '/';
        // the label value identifies the referenced content.
        labelGCContentRef = []byte("containerd.io/gc.ref.content")
        // labelGCImageRef is the key of image reference labels. The key
        // may be followed by a reference name separated by '.' or '/'; the
        // label value identifies the referenced image.
        labelGCImageRef   = []byte("containerd.io/gc.ref.image")

        // labelGCExpire indicates that an object is collectible after the
        // provided time. For image objects, this makes them available to
        // garbage collect when expired, when not provided, image objects
        // are root objects that never expire. For non-root objects such
        // as content or snapshots, these objects will be treated like
        // root objects before their expiration.
        // Expected format is RFC 3339
        labelGCExpire = []byte("containerd.io/gc.expire")

        // labelGCFlat indicates that a lease is flat and only intends to
        // lease the referenced objects, not their references. This can be
        // used to avoid leasing an entire tree of objects when only the root
        // object is needed.
        // Only the presence of the label is checked, its value is ignored.
        labelGCFlat = []byte("containerd.io/gc.flat")
)

// CollectionContext manages a resource collection during a single run of
// the garbage collector. The context is responsible for managing access to
// resources as well as tracking removal.
// Implementations should defer any longer running operations to the Finish
// function and optimize other functions for running fast during garbage
// collection write locks.
//
// A CollectionContext is obtained from Collector.StartCollection at the
// start of a garbage collection run and ends with exactly one of Cancel
// (failed run) or Finish (successful run).
type CollectionContext interface {
        // All sends all known resources
        All(func(gc.Node))

        // Active sends all active resources
        // Leased resources may be excluded since lease ownership should take
        // precedence over active status.
        Active(namespace string, fn func(gc.Node))

        // Leased sends all resources associated with the given lease
        Leased(namespace, lease string, fn func(gc.Node))

        // Remove marks the given resource as removed
        Remove(gc.Node)

        // Cancel is called to cleanup a context after a failed collection.
        // A returned error is logged by the garbage collector and does not
        // affect the other collection contexts.
        Cancel() error

        // Finish is called to cleanup a context after a successful collection.
        // A returned error is logged by the garbage collector and does not
        // affect the other collection contexts.
        Finish() error
}

// Collector is an interface to manage resource collection for any collectible
// resource registered for garbage collection.
type Collector interface {
        // StartCollection begins a collection run and returns the context
        // used for that run. When an error is returned the resource type is
        // skipped for the current garbage collection.
        StartCollection(context.Context) (CollectionContext, error)

        // ReferenceLabel returns the label suffix used to reference this
        // resource type from other objects; it is appended to
        // "containerd.io/gc.ref." to form the label key. An empty string
        // means no reference label is handled for the resource type.
        ReferenceLabel() string
}

// gcContext holds the state of a single garbage collection run. It is
// created by startGCContext.
type gcContext struct {
        // labelHandlers translate reference labels into gc nodes. The
        // built-in content, snapshot and image handlers are always present;
        // one handler is added per collector with a reference label.
        labelHandlers []referenceLabelHandler
        // contexts holds the collection context of every collector which
        // started successfully, keyed by resource type. It is nil when no
        // collectors are registered.
        contexts      map[gc.ResourceType]CollectionContext
}

// referenceLabelHandler couples a reference label key (or key prefix) with
// the function turning a matching label into a gc node.
type referenceLabelHandler struct {
        // key is the label key or key prefix the handler is responsible for.
        key []byte
        // fn receives the namespace, the label key and value, and a
        // callback to invoke with the referenced node. Handlers may ignore
        // a label by not invoking the callback.
        fn  func(string, []byte, []byte, func(gc.Node))
}

// startGCContext creates the context for one garbage collection run.
//
// The returned context always contains the built-in label handlers for
// content, snapshot and image references. For every registered collector a
// collection is started; collectors which fail to start are skipped for
// this run. Each started collector with a non-empty reference label gets a
// label handler keyed by "containerd.io/gc.ref." followed by that label.
// When collectors are registered the handlers are sorted by key.
func startGCContext(ctx context.Context, collectors map[gc.ResourceType]Collector) *gcContext {
        var contexts map[gc.ResourceType]CollectionContext
        labelHandlers := []referenceLabelHandler{
                {
                        key: labelGCContentRef,
                        fn: func(ns string, k, v []byte, fn func(gc.Node)) {
                                if ks := string(k); ks != string(labelGCContentRef) {
                                        // Allow reference naming separated by . or /, ignore names
                                        if ks[len(labelGCContentRef)] != '.' && ks[len(labelGCContentRef)] != '/' {
                                                return
                                        }
                                }

                                fn(gcnode(ResourceContent, ns, string(v)))
                        },
                },
                {
                        key: labelGCSnapRef,
                        fn: func(ns string, k, v []byte, fn func(gc.Node)) {
                                // The snapshotter name follows the prefix and ends at
                                // the first '/', if any.
                                snapshotter := k[len(labelGCSnapRef):]
                                if i := bytes.IndexByte(snapshotter, '/'); i >= 0 {
                                        snapshotter = snapshotter[:i]
                                }
                                fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", snapshotter, v)))
                        },
                },
                {
                        key: labelGCImageRef,
                        fn: func(ns string, k, v []byte, fn func(gc.Node)) {
                                if ks := string(k); ks != string(labelGCImageRef) {
                                        // Allow reference naming separated by . or /, ignore names
                                        if ks[len(labelGCImageRef)] != '.' && ks[len(labelGCImageRef)] != '/' {
                                                return
                                        }
                                }

                                fn(gcnode(ResourceImage, ns, string(v)))
                        },
                },
        }
        if len(collectors) > 0 {
                contexts = map[gc.ResourceType]CollectionContext{}
                for rt, collector := range collectors {
                        rt := rt
                        c, err := collector.StartCollection(ctx)
                        if err != nil {
                                // Only skipping this resource this round
                                continue
                        }

                        if reflabel := collector.ReferenceLabel(); reflabel != "" {
                                // Cap the shared prefix at its length so append always
                                // copies into a fresh array. Appending to labelGCRef
                                // directly could write into spare capacity of the
                                // package level slice and make the keys of different
                                // collectors overwrite each other.
                                key := append(labelGCRef[:len(labelGCRef):len(labelGCRef)], reflabel...)
                                labelHandlers = append(labelHandlers, referenceLabelHandler{
                                        key: key,
                                        fn: func(ns string, k, v []byte, fn func(gc.Node)) {
                                                if ks := string(k); ks != string(key) {
                                                        // Allow reference naming separated by . or /, ignore names
                                                        if ks[len(key)] != '.' && ks[len(key)] != '/' {
                                                                return
                                                        }
                                                }

                                                fn(gcnode(rt, ns, string(v)))
                                        },
                                })
                        }
                        contexts[rt] = c
                }
                // Sort labelHandlers to ensure key seeking is always forward
                sort.Slice(labelHandlers, func(i, j int) bool {
                        return bytes.Compare(labelHandlers[i].key, labelHandlers[j].key) < 0
                })
        }
        return &gcContext{
                labelHandlers: labelHandlers,
                contexts:      contexts,
        }
}

// all asks every collection context to send all of its known resources
// to fn.
func (c *gcContext) all(fn func(gc.Node)) {
        for _, collection := range c.contexts {
                collection.All(fn)
        }
}

// active asks every collection context to send its active resources in
// the given namespace to fn.
func (c *gcContext) active(namespace string, fn func(gc.Node)) {
        for _, collection := range c.contexts {
                collection.Active(namespace, fn)
        }
}

// leased asks every collection context to send the resources associated
// with the given lease in the given namespace to fn.
func (c *gcContext) leased(namespace, lease string, fn func(gc.Node)) {
        for _, collection := range c.contexts {
                collection.Leased(namespace, lease, fn)
        }
}

// cancel cancels every collection context after a failed collection.
// Failures are logged and do not prevent the remaining contexts from
// being cancelled.
func (c *gcContext) cancel(ctx context.Context) {
        for _, collection := range c.contexts {
                err := collection.Cancel()
                if err == nil {
                        continue
                }
                log.G(ctx).WithError(err).Error("failed to cancel collection context")
        }
}

// finish finishes every collection context after a successful collection.
// Failures are logged and do not prevent the remaining contexts from
// being finished.
func (c *gcContext) finish(ctx context.Context) {
        for _, collection := range c.contexts {
                err := collection.Finish()
                if err == nil {
                        continue
                }
                log.G(ctx).WithError(err).Error("failed to finish collection context")
        }
}

// scanRoots sends the given channel "root" resources that are certainly used.
// The caller could look the references of the resources to find all resources that are used.
func (c *gcContext) scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error {
        v1bkt := tx.Bucket(bucketKeyVersion)
        if v1bkt == nil {
                return nil
        }

        expThreshold := time.Now()

        // iterate through each namespace
        v1c := v1bkt.Cursor()

        // cerr indicates the scan did not successfully send all
        // the roots. The scan does not need to be cancelled but
        // must return error at the end.
        var cerr error
        fn := func(n gc.Node) {
                select {
                case nc <- n:
                case <-ctx.Done():
                        cerr = ctx.Err()
                }
        }

        for k, v := v1c.First(); k != nil; k, v = v1c.Next() {
                if v != nil {
                        continue
                }
                nbkt := v1bkt.Bucket(k)
                ns := string(k)

                lbkt := nbkt.Bucket(bucketKeyObjectLeases)
                if lbkt != nil {
                        if err := lbkt.ForEach(func(k, v []byte) error {
                                if v != nil {
                                        return nil
                                }
                                libkt := lbkt.Bucket(k)
                                var flat bool

                                if lblbkt := libkt.Bucket(bucketKeyObjectLabels); lblbkt != nil {
                                        if expV := lblbkt.Get(labelGCExpire); expV != nil {
                                                exp, err := time.Parse(time.RFC3339, string(expV))
                                                if err != nil {
                                                        // label not used, log and continue to use lease
                                                        log.G(ctx).WithError(err).WithField("lease", string(k)).Infof("ignoring invalid expiration value %q", string(expV))
                                                } else if expThreshold.After(exp) {
                                                        // lease has expired, skip
                                                        log.G(ctx).WithField("lease", string(k)).Debug("expired lease")
                                                        return nil
                                                }
                                        }

                                        if flatV := lblbkt.Get(labelGCFlat); flatV != nil {
                                                flat = true
                                        }
                                }

                                fn(gcnode(ResourceLease, ns, string(k)))

                                // Emit content and snapshots as roots instead of implementing
                                // in references. Since leases cannot be referenced there is
                                // no need to allow the lookup to be recursive, handling here
                                // therefore reduces the number of database seeks.

                                ctype := ResourceContent
                                if flat {
                                        ctype = resourceContentFlat
                                }

                                cbkt := libkt.Bucket(bucketKeyObjectContent)
                                if cbkt != nil {
                                        if err := cbkt.ForEach(func(k, v []byte) error {
                                                fn(gcnode(ctype, ns, string(k)))
                                                return nil
                                        }); err != nil {
                                                return err
                                        }
                                }

                                stype := ResourceSnapshot
                                if flat {
                                        stype = resourceSnapshotFlat
                                }

                                sbkt := libkt.Bucket(bucketKeyObjectSnapshots)
                                if sbkt != nil {
                                        if err := sbkt.ForEach(func(sk, sv []byte) error {
                                                if sv != nil {
                                                        return nil
                                                }
                                                snbkt := sbkt.Bucket(sk)

                                                return snbkt.ForEach(func(k, v []byte) error {
                                                        fn(gcnode(stype, ns, fmt.Sprintf("%s/%s", sk, k)))
                                                        return nil
                                                })
                                        }); err != nil {
                                                return err
                                        }
                                }

                                ibkt := libkt.Bucket(bucketKeyObjectIngests)
                                if ibkt != nil {
                                        if err := ibkt.ForEach(func(k, v []byte) error {
                                                fn(gcnode(ResourceIngest, ns, string(k)))
                                                return nil
                                        }); err != nil {
                                                return err
                                        }
                                }

                                itype := ResourceImage
                                if flat {
                                        itype = resourceImageFlat
                                }

                                ibkt = libkt.Bucket(bucketKeyObjectImages)
                                if ibkt != nil {
                                        if err := ibkt.ForEach(func(k, v []byte) error {
                                                fn(gcnode(itype, ns, string(k)))
                                                return nil
                                        }); err != nil {
                                                return err
                                        }
                                }

                                c.leased(ns, string(k), fn)

                                return nil
                        }); err != nil {
                                return err
                        }
                }

                ibkt := nbkt.Bucket(bucketKeyObjectImages)
                if ibkt != nil {
                        if err := ibkt.ForEach(func(k, v []byte) error {
                                if v != nil {
                                        return nil
                                }

                                if !isExpiredImage(ctx, k, ibkt.Bucket(k), expThreshold) {
                                        fn(gcnode(ResourceImage, ns, string(k)))
                                }
                                return nil
                        }); err != nil {
                                return err
                        }
                }

                cbkt := nbkt.Bucket(bucketKeyObjectContent)
                if cbkt != nil {
                        ibkt := cbkt.Bucket(bucketKeyObjectIngests)
                        if ibkt != nil {
                                if err := ibkt.ForEach(func(k, v []byte) error {
                                        if v != nil {
                                                return nil
                                        }
                                        ea, err := readExpireAt(ibkt.Bucket(k))
                                        if err != nil {
                                                return err
                                        }
                                        if ea == nil || expThreshold.After(*ea) {
                                                return nil
                                        }
                                        fn(gcnode(ResourceIngest, ns, string(k)))
                                        return nil
                                }); err != nil {
                                        return err
                                }
                        }
                        cbkt = cbkt.Bucket(bucketKeyObjectBlob)
                        if cbkt != nil {
                                if err := cbkt.ForEach(func(k, v []byte) error {
                                        if v != nil {
                                                return nil
                                        }

                                        if isRootRef(cbkt.Bucket(k)) {
                                                fn(gcnode(ResourceContent, ns, string(k)))
                                        }

                                        return nil
                                }); err != nil {
                                        return err
                                }
                        }
                }

                cbkt = nbkt.Bucket(bucketKeyObjectContainers)
                if cbkt != nil {
                        if err := cbkt.ForEach(func(k, v []byte) error {
                                if v != nil {
                                        return nil
                                }

                                cibkt := cbkt.Bucket(k)
                                snapshotter := string(cibkt.Get(bucketKeySnapshotter))
                                if snapshotter != "" {
                                        ss := string(cibkt.Get(bucketKeySnapshotKey))
                                        fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", snapshotter, ss)))
                                }

                                return c.sendLabelRefs(ns, cibkt, fn)
                        }); err != nil {
                                return err
                        }
                }

                sbkt := nbkt.Bucket(bucketKeyObjectSnapshots)
                if sbkt != nil {
                        if err := sbkt.ForEach(func(sk, sv []byte) error {
                                if sv != nil {
                                        return nil
                                }
                                snbkt := sbkt.Bucket(sk)

                                return snbkt.ForEach(func(k, v []byte) error {
                                        if v != nil {
                                                return nil
                                        }
                                        if isRootRef(snbkt.Bucket(k)) {
                                                fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", sk, k)))
                                        }
                                        return nil
                                })
                        }); err != nil {
                                return err
                        }
                }

                bbkt := nbkt.Bucket(bucketKeyObjectSandboxes)
                if bbkt != nil {
                        if err := bbkt.ForEach(func(k, v []byte) error {
                                if v != nil {
                                        return nil
                                }

                                sbbkt := bbkt.Bucket(k)
                                return c.sendLabelRefs(ns, sbbkt, fn)
                        }); err != nil {
                                return err
                        }
                }

                c.active(ns, fn)
        }
        return cerr
}

// references reports, through fn, every resource directly referenced by node.
//
// A node whose backing bucket is missing (for example one reached through a
// dead edge) produces no references and no error, and node types that are
// not handled below produce nothing either. A snapshot key without a "/"
// separator is rejected with an error.
func (c *gcContext) references(ctx context.Context, tx *bolt.Tx, node gc.Node, fn func(gc.Node)) error {
        nsKey := []byte(node.Namespace)

        switch node.Type {
        case ResourceContent:
                blob := getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectContent, bucketKeyObjectBlob, []byte(node.Key))
                if blob == nil {
                        // The node may have been created from a dead edge.
                        return nil
                }
                return c.sendLabelRefs(node.Namespace, blob, fn)

        case ResourceSnapshot, resourceSnapshotFlat:
                // Snapshot keys have the form "<snapshotter>/<name>".
                snapshotter, name, found := strings.Cut(node.Key, "/")
                if !found {
                        return fmt.Errorf("invalid snapshot gc key %s", node.Key)
                }
                snap := getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectSnapshots, []byte(snapshotter), []byte(name))
                if snap == nil {
                        // The node may have been created from a dead edge.
                        return nil
                }

                // The parent is reported with the same (flat or regular) type as the child.
                parent := snap.Get(bucketKeyParent)
                if len(parent) != 0 {
                        fn(gcnode(node.Type, node.Namespace, fmt.Sprintf("%s/%s", snapshotter, parent)))
                }

                // Labeled references are only followed for non-flat snapshot refs.
                if node.Type != resourceSnapshotFlat {
                        return c.sendLabelRefs(node.Namespace, snap, fn)
                }
                return nil

        case ResourceImage, resourceImageFlat:
                img := getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectImages, []byte(node.Key))
                if img == nil {
                        // The node may have been created from a dead edge.
                        return nil
                }

                flat := node.Type == resourceImageFlat
                if target := img.Bucket(bucketKeyTarget); target != nil {
                        contentType := ResourceContent
                        if flat {
                                // For flat leases, keep the target content only.
                                contentType = resourceContentFlat
                        }
                        fn(gcnode(contentType, node.Namespace, string(target.Get(bucketKeyDigest))))
                }

                // Labeled references are only followed for non-flat image refs.
                if flat {
                        return nil
                }
                return c.sendLabelRefs(node.Namespace, img, fn)

        case ResourceIngest:
                ingest := getBucket(tx, bucketKeyVersion, nsKey, bucketKeyObjectContent, bucketKeyObjectIngests, []byte(node.Key))
                if ingest == nil {
                        // The node may have been created from a dead edge.
                        return nil
                }
                // An ingest references the content stored under its "expected" key, if any.
                if expected := ingest.Get(bucketKeyExpected); len(expected) != 0 {
                        fn(gcnode(ResourceContent, node.Namespace, string(expected)))
                }
                return nil
        }

        return nil
}

// scanAll calls fn for every resource stored in the database, whether or not
// it is referenced, and then for every node reported by c.all.
//
// Within each namespace the resources are visited in this order: leases,
// snapshots, ingests, content blobs, images. The first error fn returns
// while the database is being walked aborts the scan and is returned;
// errors fn returns for nodes supplied by c.all are discarded.
func (c *gcContext) scanAll(ctx context.Context, tx *bolt.Tx, fn func(ctx context.Context, n gc.Node) error) error {
        root := tx.Bucket(bucketKeyVersion)
        if root == nil {
                return nil
        }

        // visit calls fn once per nested bucket of parent, using the bucket's
        // key as the node key. A nil parent is treated as empty.
        visit := func(parent *bolt.Bucket, ns string, rtype gc.ResourceType) error {
                if parent == nil {
                        return nil
                }
                return parent.ForEach(func(key, val []byte) error {
                        if val != nil {
                                // Plain key/value pair, not a nested bucket.
                                return nil
                        }
                        return fn(ctx, gcnode(rtype, ns, string(key)))
                })
        }

        // Each nested bucket directly under the version bucket is a namespace.
        cur := root.Cursor()
        for nsKey, nsVal := cur.First(); nsKey != nil; nsKey, nsVal = cur.Next() {
                if nsVal != nil {
                        continue
                }
                nsBkt := root.Bucket(nsKey)
                ns := string(nsKey)

                if err := visit(nsBkt.Bucket(bucketKeyObjectLeases), ns, ResourceLease); err != nil {
                        return err
                }

                // Snapshots are nested one level deeper: snapshotter, then snapshot name.
                if snapshotters := nsBkt.Bucket(bucketKeyObjectSnapshots); snapshotters != nil {
                        err := snapshotters.ForEach(func(ssKey, ssVal []byte) error {
                                if ssVal != nil {
                                        return nil
                                }
                                return snapshotters.Bucket(ssKey).ForEach(func(key, val []byte) error {
                                        if val != nil {
                                                return nil
                                        }
                                        return fn(ctx, gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", ssKey, key)))
                                })
                        })
                        if err != nil {
                                return err
                        }
                }

                if contentBkt := nsBkt.Bucket(bucketKeyObjectContent); contentBkt != nil {
                        if err := visit(contentBkt.Bucket(bucketKeyObjectIngests), ns, ResourceIngest); err != nil {
                                return err
                        }
                        if err := visit(contentBkt.Bucket(bucketKeyObjectBlob), ns, ResourceContent); err != nil {
                                return err
                        }
                }

                if err := visit(nsBkt.Bucket(bucketKeyObjectImages), ns, ResourceImage); err != nil {
                        return err
                }
        }

        // Nodes reported by c.all are passed to fn on a best-effort basis.
        c.all(func(n gc.Node) {
                _ = fn(ctx, n)
        })

        return nil
}

// remove deletes the bucket backing node from the database.
//
// For snapshots and images the returned value is the matching removal event
// (*eventstypes.SnapshotRemove or *eventstypes.ImageDelete); for every other
// case it is nil. When the bucket that would hold the node does not exist,
// nothing is deleted and (nil, nil) is returned. Node types without a case
// of their own are handed to the collection context registered in
// c.contexts for that type, if there is one.
func (c *gcContext) remove(ctx context.Context, tx *bolt.Tx, node gc.Node) (interface{}, error) {
        root := tx.Bucket(bucketKeyVersion)
        if root == nil {
                return nil, nil
        }

        ns := root.Bucket([]byte(node.Namespace))
        if ns == nil {
                // Still remove the object if it is referenced outside the db.
                if collector, found := c.contexts[node.Type]; found {
                        collector.Remove(node)
                }
                return nil, nil
        }

        switch node.Type {
        case ResourceContent:
                blobs := ns.Bucket(bucketKeyObjectContent)
                if blobs != nil {
                        blobs = blobs.Bucket(bucketKeyObjectBlob)
                }
                if blobs == nil {
                        break
                }
                log.G(ctx).WithField("key", node.Key).Debug("remove content")
                return nil, blobs.DeleteBucket([]byte(node.Key))

        case ResourceSnapshot:
                snapshots := ns.Bucket(bucketKeyObjectSnapshots)
                if snapshots == nil {
                        break
                }
                // Snapshot keys have the form "<snapshotter>/<name>".
                snapshotter, name, found := strings.Cut(node.Key, "/")
                if !found {
                        return nil, fmt.Errorf("invalid snapshot gc key %s", node.Key)
                }
                perSnapshotter := snapshots.Bucket([]byte(snapshotter))
                if perSnapshotter == nil {
                        break
                }
                log.G(ctx).WithField("key", name).WithField("snapshotter", snapshotter).Debug("remove snapshot")
                return &eventstypes.SnapshotRemove{
                        Key:         name,
                        Snapshotter: snapshotter,
                }, perSnapshotter.DeleteBucket([]byte(name))

        case ResourceImage:
                imgs := ns.Bucket(bucketKeyObjectImages)
                if imgs == nil {
                        break
                }
                return &eventstypes.ImageDelete{
                        Name: node.Key,
                }, imgs.DeleteBucket([]byte(node.Key))

        case ResourceLease:
                if leases := ns.Bucket(bucketKeyObjectLeases); leases != nil {
                        return nil, leases.DeleteBucket([]byte(node.Key))
                }

        case ResourceIngest:
                ingests := ns.Bucket(bucketKeyObjectContent)
                if ingests != nil {
                        ingests = ingests.Bucket(bucketKeyObjectIngests)
                }
                if ingests == nil {
                        break
                }
                log.G(ctx).WithField("ref", node.Key).Debug("remove ingest")
                return nil, ingests.DeleteBucket([]byte(node.Key))

        default:
                if collector, found := c.contexts[node.Type]; found {
                        collector.Remove(node)
                } else {
                        log.G(ctx).WithField("ref", node.Key).WithField("type", node.Type).Info("no remove defined for resource")
                }
        }

        return nil, nil
}

// sendLabelRefs runs every registered label handler over the labels stored
// in bkt, letting each handler report the references it finds through fn.
//
// For each handler the label cursor is positioned at the handler's key and
// advanced while the label name still starts with that key. A bucket
// without a labels sub-bucket yields nothing. The returned error is always
// nil.
func (c *gcContext) sendLabelRefs(ns string, bkt *bolt.Bucket, fn func(gc.Node)) error {
        labelsBkt := bkt.Bucket(bucketKeyObjectLabels)
        if labelsBkt == nil {
                return nil
        }

        cur := labelsBkt.Cursor()
        for _, handler := range c.labelHandlers {
                prefix := string(handler.key)
                k, v := cur.Seek(handler.key)
                for k != nil && strings.HasPrefix(string(k), prefix) {
                        handler.fn(ns, k, v, fn)
                        k, v = cur.Next()
                }
        }
        return nil
}

// isRootRef reports whether bkt has a labels sub-bucket containing the
// labelGCRoot key. The value stored under the key is not inspected.
func isRootRef(bkt *bolt.Bucket) bool {
        labelsBkt := bkt.Bucket(bucketKeyObjectLabels)
        if labelsBkt == nil {
                return false
        }
        // TODO: interpret the stored value as a timestamp and skip if expired
        return labelsBkt.Get(labelGCRoot) != nil
}

// isExpiredImage reports whether the image stored in bkt carries a
// labelGCExpire label whose RFC 3339 timestamp lies before expTheshold.
//
// Images without labels or without the expiration label are never expired.
// An expiration value that cannot be parsed is logged (with k as the image
// name) and treated as not expired.
func isExpiredImage(ctx context.Context, k []byte, bkt *bolt.Bucket, expTheshold time.Time) bool {
        labelsBkt := bkt.Bucket(bucketKeyObjectLabels)
        if labelsBkt == nil {
                return false
        }

        raw := labelsBkt.Get(labelGCExpire)
        if raw == nil {
                return false
        }

        expiry, err := time.Parse(time.RFC3339, string(raw))
        if err != nil {
                log.G(ctx).WithError(err).WithField("image", string(k)).Infof("ignoring invalid expiration value %q", string(raw))
                return false
        }
        return expTheshold.After(expiry)
}

// gcnode builds the gc.Node identifying resource key of type t in namespace ns.
func gcnode(t gc.ResourceType, ns, key string) gc.Node {
        var n gc.Node
        n.Type = t
        n.Namespace = ns
        n.Key = key
        return n
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
        "context"
        "encoding/binary"
        "errors"
        "fmt"
        "strings"
        "sync/atomic"
        "time"

        eventstypes "github.com/containerd/containerd/api/events"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/metadata/boltutil"
        "github.com/containerd/containerd/v2/pkg/epoch"
        "github.com/containerd/containerd/v2/pkg/filters"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/errdefs"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        bolt "go.etcd.io/bbolt"
)

// imageStore is the image store returned by NewImageStore; all of its
// methods read and write image records through db.
type imageStore struct {
        db *DB
}

// NewImageStore returns an images.Store that keeps its records in db,
// which is backed by a bolt DB.
func NewImageStore(db *DB) images.Store {
        store := new(imageStore)
        store.db = db
        return store
}

// Get returns the image called name from the namespace carried by ctx.
//
// The namespace is required; its absence is reported as an error. If the
// namespace has no images bucket, or no image with that name, the error
// wraps errdefs.ErrNotFound. On any error the returned image is the zero
// value.
func (s *imageStore) Get(ctx context.Context, name string) (images.Image, error) {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return images.Image{}, err
        }

        result := images.Image{Name: name}

        err = view(ctx, s.db, func(tx *bolt.Tx) error {
                var record *bolt.Bucket
                if all := getImagesBucket(tx, namespace); all != nil {
                        record = all.Bucket([]byte(name))
                }
                if record == nil {
                        return fmt.Errorf("image %q: %w", name, errdefs.ErrNotFound)
                }

                if err := readImage(&result, record); err != nil {
                        return fmt.Errorf("image %q: %w", name, err)
                }
                return nil
        })
        if err != nil {
                return images.Image{}, err
        }

        return result, nil
}

// List returns the images in the namespace carried by ctx that match the
// filter built from fs by filters.ParseAll.
//
// The namespace is required. A filter that fails to parse is reported as an
// error wrapping errdefs.ErrInvalidArgument. When the namespace has no
// images bucket the result is a nil slice and a nil error. An image record
// that cannot be read aborts the listing with that error.
func (s *imageStore) List(ctx context.Context, fs ...string) ([]images.Image, error) {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }

        filter, err := filters.ParseAll(fs...)
        if err != nil {
                return nil, fmt.Errorf("%s: %w", err.Error(), errdefs.ErrInvalidArgument)
        }

        var m []images.Image
        if err := view(ctx, s.db, func(tx *bolt.Tx) error {
                bkt := getImagesBucket(tx, namespace)
                if bkt == nil {
                        return nil // empty store
                }

                return bkt.ForEach(func(k, v []byte) error {
                        if v != nil {
                                // A non-nil value means k is a plain key, not an
                                // image bucket: bkt.Bucket(k) would return nil and
                                // readImage would dereference it. Skip such keys.
                                return nil
                        }

                        var (
                                image = images.Image{
                                        Name: string(k),
                                }
                                kbkt = bkt.Bucket(k)
                        )

                        if err := readImage(&image, kbkt); err != nil {
                                return err
                        }

                        if filter.Match(adaptImage(image)) {
                                m = append(m, image)
                        }
                        return nil
                })
        }); err != nil {
                return nil, err
        }

        return m, nil
}

// Create stores image as a new record in the namespace carried by ctx and
// returns it with its timestamps filled in.
//
// The namespace is required. The image must pass validateImage. If an image
// with the same name already exists the error wraps
// errdefs.ErrAlreadyExists. CreatedAt supplied by the caller is ignored: it
// is set to the epoch found in ctx (epoch.FromContext) when there is one,
// otherwise to the current time, always in UTC; UpdatedAt is set to the
// same instant.
//
// After the update call has returned successfully, an ImageCreate event is
// published on "/images/create" if the db has a publisher; a publish error
// is returned to the caller. On any error the returned image is the zero
// value.
func (s *imageStore) Create(ctx context.Context, image images.Image) (images.Image, error) {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return images.Image{}, err
        }

        if err := update(ctx, s.db, func(tx *bolt.Tx) error {
                if err := validateImage(&image); err != nil {
                        return err
                }

                bkt, err := createImagesBucket(tx, namespace)
                if err != nil {
                        return err
                }

                if err := addImageLease(ctx, tx, image.Name, image.Labels); err != nil {
                        return err
                }

                ibkt, err := bkt.CreateBucket([]byte(image.Name))
                if err != nil {
                        // An existing bucket under this name means the image
                        // is already stored.
                        if errors.Is(err, bolt.ErrBucketExists) {
                                return fmt.Errorf("image %q: %w", image.Name, errdefs.ErrAlreadyExists)
                        }
                        return err
                }

                // The value of `image.CreatedAt` passed from the caller is discarded here.
                // Ideally we should return an error when the value is already set.
                // However, as `image.CreatedAt` is defined as a non-pointer `time.Time`, we can't compare it to nil.
                // Its zero value `time.Time{}` (January 1, year 1, 00:00:00 UTC) is itself a valid
                // timestamp, so "unset" cannot be told apart from a caller passing that instant.
                if tm := epoch.FromContext(ctx); tm != nil {
                        image.CreatedAt = tm.UTC()
                } else {
                        image.CreatedAt = time.Now().UTC()
                }
                image.UpdatedAt = image.CreatedAt
                return writeImage(ibkt, &image)
        }); err != nil {
                return images.Image{}, err
        }

        if publisher := s.db.Publisher(ctx); publisher != nil {
                if err := publisher.Publish(ctx, "/images/create", &eventstypes.ImageCreate{
                        Name:   image.Name,
                        Labels: image.Labels,
                }); err != nil {
                        return images.Image{}, err
                }
        }

        return image, nil
}

// Update modifies the stored image named image.Name in the namespace carried
// by ctx and returns the record as written.
//
// With no fieldpaths the whole stored record is replaced by image. Otherwise
// only the listed paths are copied from image onto the stored record:
//
//   - "labels" or "labels.<key>": all labels, or a single label
//   - "annotations" or "annotations.<key>": all target annotations, or one
//   - "target": the whole target descriptor
//
// Any other path is rejected with an error wrapping
// errdefs.ErrInvalidArgument, as is an empty image name. A missing image is
// reported with an error wrapping errdefs.ErrNotFound. The stored CreatedAt
// is always preserved; UpdatedAt is set to the epoch found in ctx
// (epoch.FromContext) when there is one, otherwise to the current time, in
// UTC.
//
// After the update call has returned successfully, an ImageUpdate event is
// published on "/images/update" if the db has a publisher; a publish error
// is returned to the caller. On any error the returned image is the zero
// value.
func (s *imageStore) Update(ctx context.Context, image images.Image, fieldpaths ...string) (images.Image, error) {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return images.Image{}, err
        }

        if image.Name == "" {
                return images.Image{}, fmt.Errorf("image name is required for update: %w", errdefs.ErrInvalidArgument)
        }

        var updated images.Image

        err = update(ctx, s.db, func(tx *bolt.Tx) error {
                all, err := createImagesBucket(tx, namespace)
                if err != nil {
                        return err
                }

                record := all.Bucket([]byte(image.Name))
                if record == nil {
                        return fmt.Errorf("image %q: %w", image.Name, errdefs.ErrNotFound)
                }

                if err := readImage(&updated, record); err != nil {
                        return fmt.Errorf("image %q: %w", image.Name, err)
                }
                // Remember the stored creation time; it is restored below no
                // matter what the caller passed in.
                originalCreatedAt := updated.CreatedAt
                updated.Name = image.Name

                if len(fieldpaths) == 0 {
                        // No field mask: take the caller's image wholesale.
                        updated = image
                }
                for _, path := range fieldpaths {
                        switch {
                        case strings.HasPrefix(path, "labels."):
                                if updated.Labels == nil {
                                        updated.Labels = map[string]string{}
                                }
                                key := strings.TrimPrefix(path, "labels.")
                                updated.Labels[key] = image.Labels[key]

                        case strings.HasPrefix(path, "annotations."):
                                if updated.Target.Annotations == nil {
                                        updated.Target.Annotations = map[string]string{}
                                }
                                key := strings.TrimPrefix(path, "annotations.")
                                updated.Target.Annotations[key] = image.Target.Annotations[key]

                        case path == "labels":
                                updated.Labels = image.Labels

                        case path == "target":
                                // NOTE(stevvooe): While we allow setting individual labels, we
                                // only support replacing the target as a unit, since that is
                                // commonly pulled as a unit from other sources. It often doesn't
                                // make sense to modify the size or digest without touching the
                                // mediatype, as well, for example.
                                updated.Target = image.Target

                        case path == "annotations":
                                updated.Target.Annotations = image.Target.Annotations

                        default:
                                return fmt.Errorf("cannot update %q field on image %q: %w", path, image.Name, errdefs.ErrInvalidArgument)
                        }
                }

                if err := validateImage(&updated); err != nil {
                        return err
                }

                // A collectible label may have been added; if so, add the image to the lease.
                if err := addImageLease(ctx, tx, updated.Name, updated.Labels); err != nil {
                        return err
                }

                updated.CreatedAt = originalCreatedAt
                if tm := epoch.FromContext(ctx); tm == nil {
                        updated.UpdatedAt = time.Now().UTC()
                } else {
                        updated.UpdatedAt = tm.UTC()
                }
                return writeImage(record, &updated)
        })
        if err != nil {
                return images.Image{}, err
        }

        if publisher := s.db.Publisher(ctx); publisher != nil {
                event := &eventstypes.ImageUpdate{
                        Name:   updated.Name,
                        Labels: updated.Labels,
                }
                if err := publisher.Publish(ctx, "/images/update", event); err != nil {
                        return images.Image{}, err
                }
        }

        return updated, nil
}

// Delete removes the image called name from the namespace carried by ctx.
//
// The namespace is required. opts are applied to an images.DeleteOptions
// value first; an error from any option aborts the call. When the options
// carry a target with a non-empty digest, the image is only deleted if its
// stored target digest equals it; a mismatch is reported as an error
// wrapping errdefs.ErrNotFound. A missing images bucket or a missing image
// is reported the same way.
//
// On successful deletion the db's dirty counter is incremented. After the
// update call has returned successfully, an ImageDelete event is published
// on "/images/delete" if the db has a publisher; a publish error is
// returned to the caller.
func (s *imageStore) Delete(ctx context.Context, name string, opts ...images.DeleteOpt) error {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        var options images.DeleteOptions
        for _, opt := range opts {
                if err := opt(ctx, &options); err != nil {
                        return err
                }
        }

        if err := update(ctx, s.db, func(tx *bolt.Tx) error {
                bkt := getImagesBucket(tx, namespace)
                if bkt == nil {
                        return fmt.Errorf("image %q: %w", name, errdefs.ErrNotFound)
                }

                if err := removeImageLease(ctx, tx, name); err != nil {
                        return err
                }

                // Optional precondition: only delete when the stored target
                // digest is the one the caller expects.
                if options.Target != nil && options.Target.Digest != "" {
                        ibkt := bkt.Bucket([]byte(name))
                        if ibkt == nil {
                                return fmt.Errorf("image %q: %w", name, errdefs.ErrNotFound)
                        }

                        var check images.Image
                        if err := readImage(&check, ibkt); err != nil {
                                return fmt.Errorf("image %q: %w", name, err)
                        }

                        if check.Target.Digest != options.Target.Digest {
                                return fmt.Errorf("image %q has target %v, not %v: %w", name, check.Target.Digest, options.Target.Digest, errdefs.ErrNotFound)
                        }
                }

                // The error is kept local to the callback rather than written
                // to the enclosing function's err.
                if err := bkt.DeleteBucket([]byte(name)); err != nil {
                        if errors.Is(err, bolt.ErrBucketNotFound) {
                                return fmt.Errorf("image %q: %w", name, errdefs.ErrNotFound)
                        }
                        return err
                }

                atomic.AddUint32(&s.db.dirty, 1)

                return nil
        }); err != nil {
                return err
        }

        if publisher := s.db.Publisher(ctx); publisher != nil {
                if err := publisher.Publish(ctx, "/images/delete", &eventstypes.ImageDelete{
                        Name: name,
                }); err != nil {
                        return err
                }
        }

        return nil
}

// validateImage checks that image has a non-empty name, that each of its
// labels passes labels.Validate, and that its target passes validateTarget.
// An empty name is reported with an error wrapping
// errdefs.ErrInvalidArgument.
func validateImage(image *images.Image) error {
        if len(image.Name) == 0 {
                return fmt.Errorf("image name must not be empty: %w", errdefs.ErrInvalidArgument)
        }

        for key, value := range image.Labels {
                err := labels.Validate(key, value)
                if err != nil {
                        return fmt.Errorf("image.Labels: %w", err)
                }
        }

        return validateTarget(&image.Target)
}

// validateTarget checks the descriptor fields that the image store persists:
// the digest must validate, the size must be positive and the media type
// must be set. Every failure is reported with an error wrapping
// errdefs.ErrInvalidArgument. The checks run in that order and the first
// failure wins.
func validateTarget(target *ocispec.Descriptor) error {
        // NOTE(stevvooe): Only validate fields we actually store.

        if err := target.Digest.Validate(); err != nil {
                return fmt.Errorf("Target.Digest %q invalid: %v: %w", target.Digest, err, errdefs.ErrInvalidArgument)
        }

        switch {
        case target.Size <= 0:
                return fmt.Errorf("Target.Size must be greater than zero: %w", errdefs.ErrInvalidArgument)
        case target.MediaType == "":
                return fmt.Errorf("Target.MediaType must be set: %w", errdefs.ErrInvalidArgument)
        }

        return nil
}

// readImage fills image from the contents of bkt: timestamps, labels and
// target annotations are read through boltutil, and the digest, media type
// and size of the target are read from the nested target bucket. It returns
// an error if any read fails or if the target bucket is missing.
func readImage(image *images.Image, bkt *bolt.Bucket) error {
        err := boltutil.ReadTimestamps(bkt, &image.CreatedAt, &image.UpdatedAt)
        if err != nil {
                return err
        }

        // Labels are only assigned once they have been read successfully.
        storedLabels, err := boltutil.ReadLabels(bkt)
        if err != nil {
                return err
        }
        image.Labels = storedLabels

        if image.Target.Annotations, err = boltutil.ReadAnnotations(bkt); err != nil {
                return err
        }

        targetBkt := bkt.Bucket(bucketKeyTarget)
        if targetBkt == nil {
                return errors.New("unable to read target bucket")
        }

        // loadField copies one stored key/value pair into the matching
        // descriptor field; keys it does not recognize are ignored.
        loadField := func(k, v []byte) error {
                if v == nil {
                        // A nil value carries nothing to copy (it may be a
                        // nested bucket), so the entry is skipped.
                        return nil
                }

                // TODO(stevvooe): This is why we need to use byte values for
                // keys, rather than full arrays.
                switch field := string(k); field {
                case string(bucketKeyDigest):
                        image.Target.Digest = digest.Digest(v)
                case string(bucketKeyMediaType):
                        image.Target.MediaType = string(v)
                case string(bucketKeySize):
                        // The byte count reported by Varint is discarded.
                        size, _ := binary.Varint(v)
                        image.Target.Size = size
                }

                return nil
        }

        return targetBkt.ForEach(loadField)
}

// writeImage stores image into bkt: timestamps, labels and target
// annotations are written through boltutil, then the target digest, media
// type and varint-encoded size are put into the nested target bucket, which
// is created if it does not exist yet.
func writeImage(bkt *bolt.Bucket, image *images.Image) error {
        err := boltutil.WriteTimestamps(bkt, image.CreatedAt, image.UpdatedAt)
        if err != nil {
                return err
        }

        if err = boltutil.WriteLabels(bkt, image.Labels); err != nil {
                return fmt.Errorf("writing labels for image %v: %w", image.Name, err)
        }

        if err = boltutil.WriteAnnotations(bkt, image.Target.Annotations); err != nil {
                return fmt.Errorf("writing Annotations for image %v: %w", image.Name, err)
        }

        // write the target bucket
        targetBkt, err := bkt.CreateBucketIfNotExists(bucketKeyTarget)
        if err != nil {
                return err
        }

        // Encode the size before writing anything so that an encoding
        // failure leaves the target bucket untouched.
        size, err := encodeInt(image.Target.Size)
        if err != nil {
                return err
        }

        fields := []struct {
                key   []byte
                value []byte
        }{
                {key: bucketKeyDigest, value: []byte(image.Target.Digest)},
                {key: bucketKeyMediaType, value: []byte(image.Target.MediaType)},
                {key: bucketKeySize, value: size},
        }
        for _, field := range fields {
                if err := targetBkt.Put(field.key, field.value); err != nil {
                        return err
                }
        }

        return nil
}

// encodeInt returns the signed varint encoding of i, as produced by
// binary.PutVarint. An error is returned if the encoder reports that no
// bytes were written.
func encodeInt(i int64) ([]byte, error) {
        scratch := make([]byte, binary.MaxVarintLen64)

        written := binary.PutVarint(scratch, i)
        if written == 0 {
                return nil, fmt.Errorf("failed encoding integer = %v", i)
        }

        return scratch[:written], nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
        "context"
        "errors"
        "fmt"
        "strings"
        "sync/atomic"
        "time"

        "github.com/containerd/containerd/v2/core/leases"
        "github.com/containerd/containerd/v2/core/metadata/boltutil"
        "github.com/containerd/containerd/v2/pkg/filters"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/errdefs"
        digest "github.com/opencontainers/go-digest"
        bolt "go.etcd.io/bbolt"
)

// leaseManager manages the create/delete lifecycle of leases
// and also returns existing leases. It is the type returned by
// NewLeaseManager as a leases.Manager, and every method runs its work
// inside a transaction on db.
type leaseManager struct {
        // db is the metadata database holding the per-namespace lease buckets.
        db *DB
}

// NewLeaseManager creates a new lease manager that stores and looks up
// leases in the provided metadata database.
func NewLeaseManager(db *DB) leases.Manager {
        lm := new(leaseManager)
        lm.db = db
        return lm
}

// Create creates a new lease built from the provided options and stores it
// in the namespace carried by ctx.
//
// The options must set a lease ID. The lease is stored as a new bucket
// under the namespace's leases bucket; if a bucket with that ID already
// exists the returned error wraps errdefs.ErrAlreadyExists. On success the
// returned lease carries the creation time that was written.
func (lm *leaseManager) Create(ctx context.Context, opts ...leases.Opt) (leases.Lease, error) {
        var (
                l    leases.Lease
                none leases.Lease // zero value returned alongside every error
        )

        for i := range opts {
                if err := opts[i](&l); err != nil {
                        return none, err
                }
        }
        if len(l.ID) == 0 {
                return none, errors.New("lease id must be provided")
        }

        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return none, err
        }

        // persist writes the lease bucket, its creation time and its labels.
        persist := func(tx *bolt.Tx) error {
                leasesBkt, err := createBucketIfNotExists(tx, bucketKeyVersion, []byte(namespace), bucketKeyObjectLeases)
                if err != nil {
                        return err
                }

                leaseBkt, err := leasesBkt.CreateBucket([]byte(l.ID))
                if err != nil {
                        cause := err
                        if cause == bolt.ErrBucketExists {
                                cause = errdefs.ErrAlreadyExists
                        }
                        return fmt.Errorf("lease %q: %w", l.ID, cause)
                }

                now := time.Now().UTC()
                encoded, err := now.MarshalBinary()
                if err != nil {
                        return err
                }
                if err := leaseBkt.Put(bucketKeyCreatedAt, encoded); err != nil {
                        return err
                }

                if l.Labels != nil {
                        if err := boltutil.WriteLabels(leaseBkt, l.Labels); err != nil {
                                return err
                        }
                }

                // Only record the timestamp on the result once it is stored.
                l.CreatedAt = now
                return nil
        }

        if err := update(ctx, lm.db, persist); err != nil {
                return none, err
        }
        return l, nil
}

// Delete deletes the lease with the provided lease ID from the namespace
// carried by ctx. Delete options are accepted but ignored. If either the
// namespace's leases bucket or the lease itself is missing, the returned
// error wraps errdefs.ErrNotFound. A successful delete increments the
// database's dirty counter.
func (lm *leaseManager) Delete(ctx context.Context, lease leases.Lease, _ ...leases.DeleteOpt) error {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        remove := func(tx *bolt.Tx) error {
                leasesBkt := getBucket(tx, bucketKeyVersion, []byte(namespace), bucketKeyObjectLeases)
                if leasesBkt == nil {
                        return fmt.Errorf("lease %q: %w", lease.ID, errdefs.ErrNotFound)
                }

                switch err := leasesBkt.DeleteBucket([]byte(lease.ID)); err {
                case nil:
                        atomic.AddUint32(&lm.db.dirty, 1)
                        return nil
                case bolt.ErrBucketNotFound:
                        return fmt.Errorf("lease %q: %w", lease.ID, errdefs.ErrNotFound)
                default:
                        return err
                }
        }

        return update(ctx, lm.db, remove)
}

// List lists the leases stored in the namespace carried by ctx that match
// the given filter expressions. A filter that fails to parse yields an
// error wrapping errdefs.ErrInvalidArgument. If the namespace has no leases
// bucket the result is empty and no error is returned.
func (lm *leaseManager) List(ctx context.Context, fs ...string) ([]leases.Lease, error) {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }

        filter, err := filters.ParseAll(fs...)
        if err != nil {
                return nil, fmt.Errorf("%s: %w", err.Error(), errdefs.ErrInvalidArgument)
        }

        var matched []leases.Lease

        collect := func(tx *bolt.Tx) error {
                leasesBkt := getBucket(tx, bucketKeyVersion, []byte(namespace), bucketKeyObjectLeases)
                if leasesBkt == nil {
                        return nil
                }

                return leasesBkt.ForEach(func(id, value []byte) error {
                        // Leases are stored as nested buckets, which are
                        // reported with a nil value; plain keys are skipped.
                        if value != nil {
                                return nil
                        }
                        leaseBkt := leasesBkt.Bucket(id)

                        l := leases.Lease{ID: string(id)}

                        if raw := leaseBkt.Get(bucketKeyCreatedAt); raw != nil {
                                if err := l.CreatedAt.UnmarshalBinary(raw); err != nil {
                                        return err
                                }
                        }

                        stored, err := boltutil.ReadLabels(leaseBkt)
                        if err != nil {
                                return err
                        }
                        l.Labels = stored

                        if !filter.Match(adaptLease(l)) {
                                return nil
                        }
                        matched = append(matched, l)
                        return nil
                })
        }

        if err := view(ctx, lm.db, collect); err != nil {
                return nil, err
        }

        return matched, nil
}

// AddResource references the resource by the provided lease. The lease is
// looked up in the namespace carried by ctx; if it is missing the returned
// error wraps errdefs.ErrNotFound. The resource type is parsed by
// parseLeaseResource into a path of nested buckets, which are created as
// needed, and the resource reference is stored as a key in the last one.
func (lm *leaseManager) AddResource(ctx context.Context, lease leases.Lease, r leases.Resource) error {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        attach := func(tx *bolt.Tx) error {
                leaseBkt := getBucket(tx, bucketKeyVersion, []byte(namespace), bucketKeyObjectLeases, []byte(lease.ID))
                if leaseBkt == nil {
                        return fmt.Errorf("lease %q: %w", lease.ID, errdefs.ErrNotFound)
                }

                path, ref, err := parseLeaseResource(r)
                if err != nil {
                        return err
                }

                // Walk down the type path, creating each level on demand.
                target := leaseBkt
                for i := 0; i < len(path); i++ {
                        target, err = target.CreateBucketIfNotExists([]byte(path[i]))
                        if err != nil {
                                return err
                        }
                }

                return target.Put([]byte(ref), nil)
        }

        return update(ctx, lm.db, attach)
}

// DeleteResource dereferences the resource by the provided lease. The lease
// is looked up in the namespace carried by ctx; if it is missing the
// returned error wraps errdefs.ErrNotFound. A resource that was never
// referenced is not an error: when any bucket along the resource's type path
// is absent nothing is deleted. The database's dirty counter is incremented
// whenever the lease exists and the resource parses.
func (lm *leaseManager) DeleteResource(ctx context.Context, lease leases.Lease, r leases.Resource) error {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        detach := func(tx *bolt.Tx) error {
                leaseBkt := getBucket(tx, bucketKeyVersion, []byte(namespace), bucketKeyObjectLeases, []byte(lease.ID))
                if leaseBkt == nil {
                        return fmt.Errorf("lease %q: %w", lease.ID, errdefs.ErrNotFound)
                }

                path, ref, err := parseLeaseResource(r)
                if err != nil {
                        return err
                }

                // Descend the type path, stopping at the first missing level.
                holder := leaseBkt
                for i := 0; holder != nil && i < len(path); i++ {
                        holder = holder.Bucket([]byte(path[i]))
                }

                if holder != nil {
                        if err := holder.Delete([]byte(ref)); err != nil {
                                return err
                        }
                }

                atomic.AddUint32(&lm.db.dirty, 1)

                return nil
        }

        return update(ctx, lm.db, detach)
}

// ListResources lists all the resources referenced by the lease in the
// namespace carried by ctx. If the lease is missing the returned error wraps
// errdefs.ErrNotFound.
//
// Resources are reported in a fixed order of types: content, images,
// ingests, then snapshots. Snapshot resources carry a type of the form
// "<snapshots key>/<snapshotter>".
func (lm *leaseManager) ListResources(ctx context.Context, lease leases.Lease) ([]leases.Resource, error) {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }

        var rs []leases.Resource

        gather := func(tx *bolt.Tx) error {
                leaseBkt := getBucket(tx, bucketKeyVersion, []byte(namespace), bucketKeyObjectLeases, []byte(lease.ID))
                if leaseBkt == nil {
                        return fmt.Errorf("lease %q: %w", lease.ID, errdefs.ErrNotFound)
                }

                // Content, image and ingest resources are each stored as
                // plain keys directly inside a bucket named after the type.
                flatTypes := [][]byte{
                        bucketKeyObjectContent,
                        bucketKeyObjectImages,
                        bucketKeyObjectIngests,
                }
                for _, typeKey := range flatTypes {
                        typeBkt := leaseBkt.Bucket(typeKey)
                        if typeBkt == nil {
                                continue
                        }

                        typeName := string(typeKey)
                        err := typeBkt.ForEach(func(k, _ []byte) error {
                                rs = append(rs, leases.Resource{
                                        ID:   string(k),
                                        Type: typeName,
                                })
                                return nil
                        })
                        if err != nil {
                                return err
                        }
                }

                // Snapshot resources are nested one level deeper, grouped by
                // snapshotter name.
                snapshotsBkt := leaseBkt.Bucket(bucketKeyObjectSnapshots)
                if snapshotsBkt == nil {
                        return nil
                }

                return snapshotsBkt.ForEach(func(snapshotter, value []byte) error {
                        if value != nil {
                                // Not a nested bucket, so not a snapshotter.
                                return nil
                        }

                        typeName := fmt.Sprintf("%s/%s", bucketKeyObjectSnapshots, snapshotter)
                        return snapshotsBkt.Bucket(snapshotter).ForEach(func(k, _ []byte) error {
                                rs = append(rs, leases.Resource{
                                        ID:   string(k),
                                        Type: typeName,
                                })
                                return nil
                        })
                })
        }

        if err := view(ctx, lm.db, gather); err != nil {
                return nil, err
        }
        return rs, nil
}

// addSnapshotLease records the snapshot key under the lease carried by ctx,
// inside the lease's snapshots bucket for the given snapshotter. It does
// nothing when ctx carries no lease, and returns an error wrapping
// errdefs.ErrNotFound when the lease bucket is missing. It panics if ctx has
// no namespace.
func addSnapshotLease(ctx context.Context, tx *bolt.Tx, snapshotter, key string) error {
        leaseID, leased := leases.FromContext(ctx)
        if !leased {
                return nil
        }

        ns, hasNamespace := namespaces.Namespace(ctx)
        if !hasNamespace {
                panic("namespace must already be checked")
        }

        leaseBkt := getBucket(tx, bucketKeyVersion, []byte(ns), bucketKeyObjectLeases, []byte(leaseID))
        if leaseBkt == nil {
                return fmt.Errorf("lease does not exist: %w", errdefs.ErrNotFound)
        }

        // Descend into snapshots/<snapshotter>, creating levels as needed.
        current := leaseBkt
        for _, name := range [][]byte{bucketKeyObjectSnapshots, []byte(snapshotter)} {
                next, err := current.CreateBucketIfNotExists(name)
                if err != nil {
                        return err
                }
                current = next
        }

        return current.Put([]byte(key), nil)
}

// removeSnapshotLease deletes the snapshot key from the given snapshotter's
// bucket under the lease carried by ctx. It does nothing when ctx carries no
// lease or when that bucket does not exist. It panics if ctx has no
// namespace.
func removeSnapshotLease(ctx context.Context, tx *bolt.Tx, snapshotter, key string) error {
        leaseID, leased := leases.FromContext(ctx)
        if !leased {
                return nil
        }

        ns, hasNamespace := namespaces.Namespace(ctx)
        if !hasNamespace {
                panic("namespace must already be checked")
        }

        if snapshots := getBucket(tx, bucketKeyVersion, []byte(ns), bucketKeyObjectLeases, []byte(leaseID), bucketKeyObjectSnapshots, []byte(snapshotter)); snapshots != nil {
                return snapshots.Delete([]byte(key))
        }

        // Key does not exist so we return nil
        return nil
}

// addContentLease records the digest in the content bucket of the lease
// carried by ctx, creating that bucket if needed. It does nothing when ctx
// carries no lease, and returns an error wrapping errdefs.ErrNotFound when
// the lease bucket is missing. It panics if ctx has no namespace.
func addContentLease(ctx context.Context, tx *bolt.Tx, dgst digest.Digest) error {
        leaseID, leased := leases.FromContext(ctx)
        if !leased {
                return nil
        }

        ns, hasNamespace := namespaces.Namespace(ctx)
        if !hasNamespace {
                panic("namespace must already be required")
        }

        leaseBkt := getBucket(tx, bucketKeyVersion, []byte(ns), bucketKeyObjectLeases, []byte(leaseID))
        if leaseBkt == nil {
                return fmt.Errorf("lease does not exist: %w", errdefs.ErrNotFound)
        }

        contentBkt, err := leaseBkt.CreateBucketIfNotExists(bucketKeyObjectContent)
        if err != nil {
                return err
        }

        ref := []byte(dgst.String())
        return contentBkt.Put(ref, nil)
}

// removeContentLease deletes the digest from the content bucket of the lease
// carried by ctx. It does nothing when ctx carries no lease or when that
// bucket does not exist. It panics if ctx has no namespace.
func removeContentLease(ctx context.Context, tx *bolt.Tx, dgst digest.Digest) error {
        leaseID, leased := leases.FromContext(ctx)
        if !leased {
                return nil
        }

        ns, hasNamespace := namespaces.Namespace(ctx)
        if !hasNamespace {
                panic("namespace must already be checked")
        }

        if contentBkt := getBucket(tx, bucketKeyVersion, []byte(ns), bucketKeyObjectLeases, []byte(leaseID), bucketKeyObjectContent); contentBkt != nil {
                return contentBkt.Delete([]byte(dgst.String()))
        }

        // Key does not exist so we return nil
        return nil
}

// addIngestLease records the ingest reference in the ingests bucket of the
// lease carried by ctx, creating that bucket if needed. The boolean result
// is true only when the reference was written; it is false with a nil error
// when ctx carries no lease. A missing lease bucket yields an error wrapping
// errdefs.ErrNotFound. It panics if ctx has no namespace.
func addIngestLease(ctx context.Context, tx *bolt.Tx, ref string) (bool, error) {
        leaseID, leased := leases.FromContext(ctx)
        if !leased {
                return false, nil
        }

        ns, hasNamespace := namespaces.Namespace(ctx)
        if !hasNamespace {
                panic("namespace must already be required")
        }

        leaseBkt := getBucket(tx, bucketKeyVersion, []byte(ns), bucketKeyObjectLeases, []byte(leaseID))
        if leaseBkt == nil {
                return false, fmt.Errorf("lease does not exist: %w", errdefs.ErrNotFound)
        }

        ingestsBkt, err := leaseBkt.CreateBucketIfNotExists(bucketKeyObjectIngests)
        if err != nil {
                return false, err
        }

        err = ingestsBkt.Put([]byte(ref), nil)
        if err != nil {
                return false, err
        }

        return true, nil
}

// removeIngestLease deletes the ingest reference from the ingests bucket of
// the lease carried by ctx. It does nothing when ctx carries no lease or
// when that bucket does not exist. It panics if ctx has no namespace.
func removeIngestLease(ctx context.Context, tx *bolt.Tx, ref string) error {
        leaseID, leased := leases.FromContext(ctx)
        if !leased {
                return nil
        }

        ns, hasNamespace := namespaces.Namespace(ctx)
        if !hasNamespace {
                panic("namespace must already be checked")
        }

        if ingestsBkt := getBucket(tx, bucketKeyVersion, []byte(ns), bucketKeyObjectLeases, []byte(leaseID), bucketKeyObjectIngests); ingestsBkt != nil {
                return ingestsBkt.Delete([]byte(ref))
        }

        // Key does not exist so we return nil
        return nil
}

// addImageLease records the image reference in the images bucket of the
// lease carried by ctx, creating that bucket if needed. It does nothing when
// ctx carries no lease or when the image labels contain no labelGCExpire
// entry. A missing lease bucket yields an error wrapping
// errdefs.ErrNotFound. It panics if ctx has no namespace.
func addImageLease(ctx context.Context, tx *bolt.Tx, ref string, labels map[string]string) error {
        leaseID, leased := leases.FromContext(ctx)
        if !leased {
                return nil
        }

        // If image doesn't have expiration, it does not need to be leased
        _, expires := labels[string(labelGCExpire)]
        if !expires {
                return nil
        }

        ns, hasNamespace := namespaces.Namespace(ctx)
        if !hasNamespace {
                panic("namespace must already be required")
        }

        leaseBkt := getBucket(tx, bucketKeyVersion, []byte(ns), bucketKeyObjectLeases, []byte(leaseID))
        if leaseBkt == nil {
                return fmt.Errorf("lease does not exist: %w", errdefs.ErrNotFound)
        }

        imagesBkt, err := leaseBkt.CreateBucketIfNotExists(bucketKeyObjectImages)
        if err != nil {
                return err
        }

        return imagesBkt.Put([]byte(ref), nil)
}

// removeImageLease deletes the image reference from the images bucket of the
// lease carried by ctx. It does nothing when ctx carries no lease or when
// that bucket does not exist. It panics if ctx has no namespace.
func removeImageLease(ctx context.Context, tx *bolt.Tx, ref string) error {
        leaseID, leased := leases.FromContext(ctx)
        if !leased {
                return nil
        }

        ns, hasNamespace := namespaces.Namespace(ctx)
        if !hasNamespace {
                panic("namespace must already be checked")
        }

        if imagesBkt := getBucket(tx, bucketKeyVersion, []byte(ns), bucketKeyObjectLeases, []byte(leaseID), bucketKeyObjectImages); imagesBkt != nil {
                return imagesBkt.Delete([]byte(ref))
        }

        // Key does not exist so we return nil
        return nil
}

// parseLeaseResource splits a lease resource into the bucket path derived
// from its type (split on "/") and the reference key to store in the last
// bucket of that path.
//
// Content, ingest and image types must consist of a single path element;
// content references are additionally parsed as digests and returned in
// their String form. Snapshot types must have exactly two elements. Shape
// violations yield an error wrapping errdefs.ErrInvalidArgument; any other
// leading type yields an error wrapping errdefs.ErrNotImplemented.
func parseLeaseResource(r leases.Resource) ([]string, string, error) {
        ref := r.ID
        keys := strings.Split(r.Type, "/")

        switch keys[0] {
        case string(bucketKeyObjectContent):
                if len(keys) != 1 {
                        return nil, "", fmt.Errorf("invalid resource type %s: %w", r.Type, errdefs.ErrInvalidArgument)
                }

                dgst, err := digest.Parse(ref)
                if err != nil {
                        return nil, "", fmt.Errorf("invalid content resource id %s: %v: %w", ref, err, errdefs.ErrInvalidArgument)
                }
                ref = dgst.String()

        case string(bucketKeyObjectIngests), string(bucketKeyObjectImages):
                if len(keys) != 1 {
                        return nil, "", fmt.Errorf("invalid resource type %s: %w", r.Type, errdefs.ErrInvalidArgument)
                }

        case string(bucketKeyObjectSnapshots):
                if len(keys) != 2 {
                        return nil, "", fmt.Errorf("invalid snapshot resource type %s: %w", r.Type, errdefs.ErrInvalidArgument)
                }

        default:
                return nil, "", fmt.Errorf("resource type %s not supported yet: %w", r.Type, errdefs.ErrNotImplemented)
        }

        return keys, ref, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import bolt "go.etcd.io/bbolt"

// migration describes a single database migration step: the schema and
// version it produces, and the function that performs it.
type migration struct {
        // schema is the schema name the migration belongs to (e.g. "v1").
        schema string
        // version is the database version associated with this step; entries
        // in the migrations list are ordered by it.
        version int
        // migrate performs the data changes for this step using the given
        // bolt transaction.
        migrate func(*bolt.Tx) error
}

// migrations stores the list of database migrations
// for each update to the database schema. The migrations
// array MUST be ordered by version from least to greatest.
// The last entry in the array should correspond to the
// schemaVersion and dbVersion constants.
// A migration test MUST be added for each migration in
// the array.
// The migrate function can safely assume the version
// of the data it is migrating from is the previous version
// of the database.
var migrations = []migration{
        {
                // Adds children links beneath parent snapshots.
                schema:  "v1",
                version: 1,
                migrate: addChildLinks,
        },
        {
                // Moves ingests into the structured ingests bucket.
                schema:  "v1",
                version: 2,
                migrate: migrateIngests,
        },
        {
                // Version bump only; no data is changed.
                schema:  "v1",
                version: 3,
                migrate: noOpMigration,
        },
}

// addChildLinks adds a children key to the snapshotters to enforce that
// snapshot entries which have children cannot be removed.
//
// For every namespace, snapshotter and snapshot, a snapshot that records a
// parent gets its own key added to the parent's children bucket. Snapshots
// whose parent bucket is missing are skipped. It is a no-op when the version
// bucket does not exist.
func addChildLinks(tx *bolt.Tx) error {
        versionBkt := tx.Bucket(bucketKeyVersion)
        if versionBkt == nil {
                return nil
        }

        // linkSnapshots registers each snapshot of one snapshotter with its
        // parent snapshot.
        linkSnapshots := func(snapshots *bolt.Bucket) error {
                return snapshots.ForEach(func(name, value []byte) error {
                        if value != nil {
                                return nil
                        }

                        parent := snapshots.Bucket(name).Get(bucketKeyParent)
                        if len(parent) == 0 {
                                return nil
                        }

                        parentBkt := snapshots.Bucket(parent)
                        if parentBkt == nil {
                                // Not enforcing consistency during migration, skip
                                return nil
                        }

                        children, err := parentBkt.CreateBucketIfNotExists(bucketKeyChildren)
                        if err != nil {
                                return err
                        }
                        return children.Put(name, nil)
                })
        }

        // iterate through each namespace
        cursor := versionBkt.Cursor()
        for ns, value := cursor.First(); ns != nil; ns, value = cursor.Next() {
                if value != nil {
                        continue
                }

                snapshotters := versionBkt.Bucket(ns).Bucket(bucketKeyObjectSnapshots)
                if snapshotters == nil {
                        continue
                }

                // Iterate through each snapshotter
                err := snapshotters.ForEach(func(snapshotter, value []byte) error {
                        if value != nil {
                                return nil
                        }
                        return linkSnapshots(snapshotters.Bucket(snapshotter))
                })
                if err != nil {
                        return err
                }
        }

        return nil
}

// migrateIngests moves ingests from the key/value ingest bucket
// to a structured ingest bucket for storing additional state about
// an ingest.
//
// For each namespace that has a deprecated ingest bucket under its content
// bucket, every entry becomes a nested bucket in the new ingests bucket with
// the old value stored under the ref key; the deprecated bucket is then
// deleted. It is a no-op when the version bucket does not exist.
func migrateIngests(tx *bolt.Tx) error {
        versionBkt := tx.Bucket(bucketKeyVersion)
        if versionBkt == nil {
                return nil
        }

        // iterate through each namespace
        cursor := versionBkt.Cursor()
        for ns, value := cursor.First(); ns != nil; ns, value = cursor.Next() {
                if value != nil {
                        continue
                }

                contentBkt := versionBkt.Bucket(ns).Bucket(bucketKeyObjectContent)
                if contentBkt == nil {
                        continue
                }

                legacy := contentBkt.Bucket(deprecatedBucketKeyObjectIngest)
                if legacy == nil {
                        continue
                }

                // Create new ingests bucket
                ingests, err := contentBkt.CreateBucketIfNotExists(bucketKeyObjectIngests)
                if err != nil {
                        return err
                }

                // moveIngest re-homes one legacy entry as a structured bucket.
                moveIngest := func(ref, bref []byte) error {
                        ingest, err := ingests.CreateBucketIfNotExists(ref)
                        if err != nil {
                                return err
                        }
                        return ingest.Put(bucketKeyRef, bref)
                }
                if err := legacy.ForEach(moveIngest); err != nil {
                        return err
                }

                err = contentBkt.DeleteBucket(deprecatedBucketKeyObjectIngest)
                if err != nil {
                        return err
                }
        }

        return nil
}

// noOpMigration changes no data and always succeeds. It marks the database
// change from boltdb/bolt, which is no longer supported, to go.etcd.io/bbolt,
// the currently maintained repo for boltdb.
func noOpMigration(_ *bolt.Tx) error {
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
        "context"
        "fmt"
        "strings"

        "github.com/containerd/containerd/v2/pkg/identifiers"
        l "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/errdefs"
        bolt "go.etcd.io/bbolt"
)

// namespaceStore is the type returned by NewNamespaceStore as a
// namespaces.Store. All of its methods operate on the bolt transaction it
// was constructed with.
type namespaceStore struct {
        // tx is the bolt transaction used for every read and write.
        tx *bolt.Tx
}

// NewNamespaceStore returns a namespace store whose operations all run
// against the given bolt transaction.
func NewNamespaceStore(tx *bolt.Tx) namespaces.Store {
        store := new(namespaceStore)
        store.tx = tx
        return store
}

// Create adds a new namespace with the given labels. The namespace name must
// pass identifiers.Validate and every label must pass label validation. If
// the namespace already exists the returned error wraps
// errdefs.ErrAlreadyExists.
func (s *namespaceStore) Create(ctx context.Context, namespace string, labels map[string]string) error {
        versionBkt, err := createBucketIfNotExists(s.tx, bucketKeyVersion)
        if err != nil {
                return err
        }

        if err = identifiers.Validate(namespace); err != nil {
                return err
        }

        for key, value := range labels {
                if err := l.Validate(key, value); err != nil {
                        return fmt.Errorf("namespace.Labels: %w", err)
                }
        }

        // provides the already exists error.
        nsBkt, err := versionBkt.CreateBucket([]byte(namespace))
        switch {
        case err == bolt.ErrBucketExists:
                return fmt.Errorf("namespace %q: %w", namespace, errdefs.ErrAlreadyExists)
        case err != nil:
                return err
        }

        labelsBkt, err := nsBkt.CreateBucketIfNotExists(bucketKeyObjectLabels)
        if err != nil {
                return err
        }

        for key, value := range labels {
                err := labelsBkt.Put([]byte(key), []byte(value))
                if err != nil {
                        return err
                }
        }

        return nil
}

// Labels returns the labels stored for the namespace. When the namespace has
// no labels bucket the result is an empty, non-nil map. If iterating the
// bucket fails, the map is discarded and the error is returned.
func (s *namespaceStore) Labels(ctx context.Context, namespace string) (map[string]string, error) {
        found := make(map[string]string)

        labelsBkt := getNamespaceLabelsBucket(s.tx, namespace)
        if labelsBkt == nil {
                return found, nil
        }

        copyLabel := func(k, v []byte) error {
                found[string(k)] = string(v)
                return nil
        }
        if err := labelsBkt.ForEach(copyLabel); err != nil {
                return nil, err
        }

        return found, nil
}

// SetLabel sets a single label on the namespace after validating the
// key/value pair. An empty value deletes the label instead of storing it.
func (s *namespaceStore) SetLabel(ctx context.Context, namespace, key, value string) error {
        if err := l.Validate(key, value); err != nil {
                return fmt.Errorf("namespace.Labels: %w", err)
        }

        apply := func(bkt *bolt.Bucket) error {
                name := []byte(key)
                if value != "" {
                        return bkt.Put(name, []byte(value))
                }
                // An empty value means the label is removed.
                return bkt.Delete(name)
        }

        return withNamespacesLabelsBucket(s.tx, namespace, apply)
}

// List returns the names of all namespaces, which are the nested buckets
// found directly under the version bucket. It returns nil when the version
// bucket does not exist.
func (s *namespaceStore) List(ctx context.Context) ([]string, error) {
        versionBkt := getBucket(s.tx, bucketKeyVersion)
        if versionBkt == nil {
                return nil, nil // no namespaces!
        }

        var names []string
        err := versionBkt.ForEach(func(k, v []byte) error {
                // Only entries without a value are buckets, i.e. namespaces.
                if v == nil {
                        names = append(names, string(k))
                }
                return nil
        })
        if err != nil {
                return nil, err
        }

        return names, nil
}

// Delete removes the bucket of the given namespace.
//
// Every option in opts is applied to a DeleteInfo carrying the namespace
// name; the first option error aborts the call. The namespace must not hold
// any images, blobs, containers or snapshots, otherwise an error wrapping
// errdefs.ErrFailedPrecondition is returned that names the remaining object
// types. If the namespace (or the version bucket that would contain it) does
// not exist, an error wrapping errdefs.ErrNotFound is returned.
func (s *namespaceStore) Delete(ctx context.Context, namespace string, opts ...namespaces.DeleteOpts) error {
        i := &namespaces.DeleteInfo{
                Name: namespace,
        }
        for _, o := range opts {
                if err := o(ctx, i); err != nil {
                        return err
                }
        }
        bkt := getBucket(s.tx, bucketKeyVersion)
        types, err := s.listNs(namespace)
        if err != nil {
                return err
        }

        if len(types) > 0 {
                return fmt.Errorf(
                        "namespace %q must be empty, but it still has %s: %w",
                        namespace, strings.Join(types, ", "),
                        errdefs.ErrFailedPrecondition,
                )
        }

        // getBucket yields nil when the version bucket has not been created
        // (List guards against the same case). Without this check the
        // DeleteBucket call below would dereference a nil bucket.
        if bkt == nil {
                return fmt.Errorf("namespace %q: %w", namespace, errdefs.ErrNotFound)
        }

        if err := bkt.DeleteBucket([]byte(namespace)); err != nil {
                if err == bolt.ErrBucketNotFound {
                        return fmt.Errorf("namespace %q: %w", namespace, errdefs.ErrNotFound)
                }

                return err
        }

        return nil
}

// listNs reports which kinds of objects are still present in the given
// namespace. Only the kinds are reported, not the objects themselves, to
// keep the check cheap.
func (s *namespaceStore) listNs(namespace string) ([]string, error) {
        var remaining []string

        if bkt := getImagesBucket(s.tx, namespace); !isBucketEmpty(bkt) {
                remaining = append(remaining, "images")
        }
        if bkt := getBlobsBucket(s.tx, namespace); !isBucketEmpty(bkt) {
                remaining = append(remaining, "blobs")
        }
        if bkt := getContainersBucket(s.tx, namespace); !isBucketEmpty(bkt) {
                remaining = append(remaining, "containers")
        }

        snapshotters := getSnapshottersBucket(s.tx, namespace)
        if snapshotters == nil {
                return remaining, nil
        }

        err := snapshotters.ForEach(func(name, v []byte) error {
                // Only nested buckets (nil value) are inspected.
                if v != nil {
                        return nil
                }
                if isBucketEmpty(snapshotters.Bucket(name)) {
                        return nil
                }
                remaining = append(remaining, fmt.Sprintf("snapshots on %q snapshotter", name))
                return nil
        })
        if err != nil {
                return nil, err
        }

        return remaining, nil
}

// isBucketEmpty reports whether bkt is nil or its cursor finds no first key.
func isBucketEmpty(bkt *bolt.Bucket) bool {
        if bkt != nil {
                firstKey, _ := bkt.Cursor().First()
                return firstKey == nil
        }

        return true
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
        "context"
        "errors"
        "fmt"
        "strings"
        "time"

        "github.com/containerd/containerd/v2/core/metadata/boltutil"
        api "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/pkg/filters"
        "github.com/containerd/containerd/v2/pkg/identifiers"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/errdefs"
        "github.com/containerd/typeurl/v2"
        "go.etcd.io/bbolt"
)

// sandboxStore implements api.Store on top of the metadata database.
type sandboxStore struct {
        // db is the metadata database every store operation runs its
        // transactions against.
        db *DB
}

// Compile-time check that *sandboxStore satisfies api.Store.
var _ api.Store = (*sandboxStore)(nil)

// NewSandboxStore creates a database client for sandboxes backed by db.
func NewSandboxStore(db *DB) api.Store {
        store := &sandboxStore{db: db}
        return store
}

// Create stores a new sandbox record in the namespace taken from ctx.
// CreatedAt and UpdatedAt are both set to the current UTC time before the
// record is validated and written. The stored sandbox is returned.
func (s *sandboxStore) Create(ctx context.Context, sandbox api.Sandbox) (api.Sandbox, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return api.Sandbox{}, err
        }

        now := time.Now().UTC()
        sandbox.CreatedAt, sandbox.UpdatedAt = now, now

        if err := s.validate(&sandbox); err != nil {
                return api.Sandbox{}, fmt.Errorf("failed to validate sandbox: %w", err)
        }

        persist := func(tx *bbolt.Tx) error {
                parent, err := createSandboxBucket(tx, ns)
                if err != nil {
                        return fmt.Errorf("create error: %w", err)
                }

                // overwrite=false: an existing record with the same ID is an error.
                if err := s.write(parent, &sandbox, false); err != nil {
                        return fmt.Errorf("write error: %w", err)
                }

                return nil
        }
        if err := update(ctx, s.db, persist); err != nil {
                return api.Sandbox{}, err
        }

        return sandbox, nil
}

// Update copies the selected fields from sandbox onto the stored record with
// the same ID and writes the result back.
//
// Accepted field paths are "labels", "labels.<key>", "extensions",
// "extensions.<key>", "runtime" and "spec"; any other path yields an
// invalid-argument error. When no field paths are given, labels, extensions,
// spec and runtime are all replaced, and the runtime name must match the
// stored one. The updated record is returned.
func (s *sandboxStore) Update(ctx context.Context, sandbox api.Sandbox, fieldpaths ...string) (api.Sandbox, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return api.Sandbox{}, err
        }

        var result api.Sandbox
        apply := func(tx *bbolt.Tx) error {
                parent := getSandboxBucket(tx, ns)
                if parent == nil {
                        return fmt.Errorf("no sandbox buckets: %w", errdefs.ErrNotFound)
                }

                current, err := s.read(parent, []byte(sandbox.ID))
                if err != nil {
                        return err
                }

                if len(fieldpaths) == 0 {
                        fieldpaths = []string{"labels", "extensions", "spec", "runtime"}

                        if current.Runtime.Name != sandbox.Runtime.Name {
                                return fmt.Errorf("sandbox.Runtime.Name field is immutable: %w", errdefs.ErrInvalidArgument)
                        }
                }

                for _, path := range fieldpaths {
                        switch {
                        case strings.HasPrefix(path, "labels."):
                                // Single label update.
                                if current.Labels == nil {
                                        current.Labels = map[string]string{}
                                }
                                name := strings.TrimPrefix(path, "labels.")
                                current.Labels[name] = sandbox.Labels[name]
                        case strings.HasPrefix(path, "extensions."):
                                // Single extension update.
                                if current.Extensions == nil {
                                        current.Extensions = map[string]typeurl.Any{}
                                }
                                name := strings.TrimPrefix(path, "extensions.")
                                current.Extensions[name] = sandbox.Extensions[name]
                        case path == "labels":
                                current.Labels = sandbox.Labels
                        case path == "extensions":
                                current.Extensions = sandbox.Extensions
                        case path == "runtime":
                                current.Runtime = sandbox.Runtime
                        case path == "spec":
                                current.Spec = sandbox.Spec
                        default:
                                return fmt.Errorf("cannot update %q field on sandbox %q: %w", path, sandbox.ID, errdefs.ErrInvalidArgument)
                        }
                }

                current.UpdatedAt = time.Now().UTC()

                if err := s.write(parent, &current, true); err != nil {
                        return err
                }

                result = current
                return nil
        }
        if err := update(ctx, s.db, apply); err != nil {
                return api.Sandbox{}, err
        }

        return result, nil
}

// Get reads the sandbox with the given id from the namespace taken from ctx.
// A missing sandbox bucket is reported as a not-found error.
func (s *sandboxStore) Get(ctx context.Context, id string) (api.Sandbox, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return api.Sandbox{}, err
        }

        var found api.Sandbox
        lookup := func(tx *bbolt.Tx) error {
                parent := getSandboxBucket(tx, ns)
                if parent == nil {
                        return fmt.Errorf("no sandbox buckets: %w", errdefs.ErrNotFound)
                }

                sb, err := s.read(parent, []byte(id))
                if err != nil {
                        return err
                }

                found = sb
                return nil
        }
        if err := view(ctx, s.db, lookup); err != nil {
                return api.Sandbox{}, err
        }

        return found, nil
}

// List returns the sandboxes in the namespace taken from ctx that match the
// filter parsed from fields. A filter that fails to parse yields an
// invalid-argument error; a namespace without a sandbox bucket yields an
// empty result.
func (s *sandboxStore) List(ctx context.Context, fields ...string) ([]api.Sandbox, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }

        filter, err := filters.ParseAll(fields...)
        if err != nil {
                return nil, fmt.Errorf("%s: %w", err.Error(), errdefs.ErrInvalidArgument)
        }

        var matched []api.Sandbox
        err = view(ctx, s.db, func(tx *bbolt.Tx) error {
                bucket := getSandboxBucket(tx, ns)
                if bucket == nil {
                        // No sandbox has been created yet, so there is nothing to list.
                        return nil
                }

                return bucket.ForEach(func(k, _ []byte) error {
                        sb, err := s.read(bucket, k)
                        if err != nil {
                                return fmt.Errorf("failed to read bucket %q: %w", string(k), err)
                        }

                        if filter.Match(adaptSandbox(&sb)) {
                                matched = append(matched, sb)
                        }

                        return nil
                })
        })
        if err != nil {
                return nil, err
        }

        return matched, nil
}

// Delete removes the sandbox with the given id from the namespace taken from
// ctx. A missing sandbox bucket or a missing record is reported as a
// not-found error.
func (s *sandboxStore) Delete(ctx context.Context, id string) error {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        return update(ctx, s.db, func(tx *bbolt.Tx) error {
                parent := getSandboxBucket(tx, ns)
                if parent == nil {
                        return fmt.Errorf("no sandbox buckets: %w", errdefs.ErrNotFound)
                }

                err := parent.DeleteBucket([]byte(id))
                if err == nil {
                        return nil
                }

                // Translate the storage-level error into the errdefs equivalent.
                if err == bbolt.ErrBucketNotFound {
                        err = errdefs.ErrNotFound
                }
                return fmt.Errorf("failed to delete sandbox %q: %w", id, err)
        })
}

// write validates instance and stores it in a bucket named after its ID
// under parent. With overwrite set an existing bucket is reused; otherwise an
// existing bucket results in an already-exists error.
func (s *sandboxStore) write(parent *bbolt.Bucket, instance *api.Sandbox, overwrite bool) error {
        if err := s.validate(instance); err != nil {
                return err
        }

        var (
                key    = []byte(instance.ID)
                target *bbolt.Bucket
                err    error
        )

        if !overwrite {
                target, err = parent.CreateBucket(key)
                if err == bbolt.ErrBucketExists {
                        return fmt.Errorf("sandbox bucket %q already exists: %w", instance.ID, errdefs.ErrAlreadyExists)
                }
        } else {
                target, err = parent.CreateBucketIfNotExists(key)
        }
        if err != nil {
                return err
        }

        // Top-level fields of the sandbox record.
        if err := boltutil.WriteTimestamps(target, instance.CreatedAt, instance.UpdatedAt); err != nil {
                return err
        }
        if err := boltutil.WriteLabels(target, instance.Labels); err != nil {
                return err
        }
        if err := boltutil.WriteExtensions(target, instance.Extensions); err != nil {
                return err
        }
        if err := boltutil.WriteAny(target, bucketKeySpec, instance.Spec); err != nil {
                return err
        }
        if err := target.Put(bucketKeySandboxer, []byte(instance.Sandboxer)); err != nil {
                return err
        }

        // Runtime name and options live in their own nested bucket.
        rtBucket, err := target.CreateBucketIfNotExists(bucketKeyRuntime)
        if err != nil {
                return err
        }
        if err := rtBucket.Put(bucketKeyName, []byte(instance.Runtime.Name)); err != nil {
                return err
        }

        return boltutil.WriteAny(rtBucket, bucketKeyOptions, instance.Runtime.Options)
}

// read loads the sandbox stored in the bucket named id under parent. It
// returns a not-found error when that bucket is missing and a plain error
// when the record has no runtime bucket.
func (s *sandboxStore) read(parent *bbolt.Bucket, id []byte) (api.Sandbox, error) {
        record := parent.Bucket(id)
        if record == nil {
                return api.Sandbox{}, fmt.Errorf("bucket %q not found: %w", id, errdefs.ErrNotFound)
        }

        sb := api.Sandbox{ID: string(id)}

        lbls, err := boltutil.ReadLabels(record)
        if err != nil {
                return api.Sandbox{}, err
        }
        sb.Labels = lbls

        if err := boltutil.ReadTimestamps(record, &sb.CreatedAt, &sb.UpdatedAt); err != nil {
                return api.Sandbox{}, err
        }

        if sb.Spec, err = boltutil.ReadAny(record, bucketKeySpec); err != nil {
                return api.Sandbox{}, err
        }

        rtBucket := record.Bucket(bucketKeyRuntime)
        if rtBucket == nil {
                return api.Sandbox{}, errors.New("no runtime bucket")
        }

        sb.Runtime.Name = string(rtBucket.Get(bucketKeyName))
        if sb.Runtime.Options, err = boltutil.ReadAny(rtBucket, bucketKeyOptions); err != nil {
                return api.Sandbox{}, err
        }

        if sb.Extensions, err = boltutil.ReadExtensions(record); err != nil {
                return api.Sandbox{}, err
        }

        // A missing sandboxer key converts to the empty string.
        sb.Sandboxer = string(record.Get(bucketKeySandboxer))

        return sb, nil
}

// validate checks that the sandbox has a valid identifier and that both of
// its timestamps are set.
func (s *sandboxStore) validate(new *api.Sandbox) error {
        if err := identifiers.Validate(new.ID); err != nil {
                return fmt.Errorf("invalid sandbox ID: %w", err)
        }

        switch {
        case new.CreatedAt.IsZero():
                return fmt.Errorf("creation date must not be zero: %w", errdefs.ErrInvalidArgument)
        case new.UpdatedAt.IsZero():
                return fmt.Errorf("updated date must not be zero: %w", errdefs.ErrInvalidArgument)
        }

        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
        "context"
        "fmt"
        "strings"
        "sync"
        "sync/atomic"
        "time"

        eventstypes "github.com/containerd/containerd/api/events"
        "github.com/containerd/containerd/v2/core/metadata/boltutil"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/pkg/filters"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        bolt "go.etcd.io/bbolt"
)

const (
        // inheritedLabelsPrefix is a label key prefix.
        // NOTE(review): not referenced in the visible part of this file;
        // presumably it marks labels that are passed on to the backend
        // snapshotter — confirm against its users.
        inheritedLabelsPrefix = "containerd.io/snapshot/"
        // labelSnapshotRef is the label createSnapshot reads to obtain the
        // target snapshot reference for the snapshot being created.
        labelSnapshotRef = "containerd.io/snapshot.ref"
)

// snapshotter wraps a backend snapshots.Snapshotter and keeps per-namespace
// snapshot metadata in the metadata database.
type snapshotter struct {
        // Snapshotter is the embedded backend that performs the actual
        // snapshot operations.
        snapshots.Snapshotter
        // name identifies this snapshotter's bucket in the database and is
        // reported in events and log fields.
        name string
        // db is the metadata database used for all transactions.
        db *DB
        // l is read-locked by Update, createSnapshot and Commit.
        // NOTE(review): the write-lock holder is outside the visible part of
        // this file — confirm what it serializes against.
        l sync.RWMutex
}

// newSnapshotter wraps the backend snapshotter sn so that its snapshots are
// tracked under the provided name in the given database.
func newSnapshotter(db *DB, name string, sn snapshots.Snapshotter) *snapshotter {
        wrapped := new(snapshotter)
        wrapped.Snapshotter = sn
        wrapped.name = name
        wrapped.db = db
        return wrapped
}

// createKey builds a key of the form "<namespace>/<id>/<key>".
func createKey(id uint64, namespace, key string) string {
        return namespace + "/" + fmt.Sprint(id) + "/" + key
}

// getKey returns the name stored for the snapshot key inside the bucket of
// the named snapshotter in namespace ns. It returns the empty string when
// the snapshotter bucket, the snapshot bucket or the stored name is missing.
func getKey(tx *bolt.Tx, ns, name, key string) string {
        snapshotterBkt := getSnapshotterBucket(tx, ns, name)
        if snapshotterBkt == nil {
                return ""
        }

        entry := snapshotterBkt.Bucket([]byte(key))
        if entry == nil {
                return ""
        }

        // A missing or empty value converts to the empty string.
        return string(entry.Get(bucketKeyName))
}

// resolveKey looks up the name stored for key in the namespace taken from
// ctx. It returns a not-found error when no such name is stored.
func (s *snapshotter) resolveKey(ctx context.Context, key string) (string, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return "", err
        }

        var resolved string
        lookup := func(tx *bolt.Tx) error {
                resolved = getKey(tx, ns, s.name, key)
                if resolved != "" {
                        return nil
                }
                return fmt.Errorf("snapshot %v does not exist: %w", key, errdefs.ErrNotFound)
        }
        if err := view(ctx, s.db, lookup); err != nil {
                return "", err
        }

        return resolved, nil
}

// Stat returns the info of the snapshot identified by key. Labels,
// timestamps and parent are read from the metadata database, the rest comes
// from the backend snapshotter; the stored values are overlaid on the
// backend result.
func (s *snapshotter) Stat(ctx context.Context, key string) (snapshots.Info, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return snapshots.Info{}, err
        }

        var backendKey string
        stored := snapshots.Info{Name: key}

        load := func(tx *bolt.Tx) error {
                snapshotterBkt := getSnapshotterBucket(tx, ns, s.name)
                if snapshotterBkt == nil {
                        return fmt.Errorf("snapshot %v does not exist: %w", key, errdefs.ErrNotFound)
                }
                entry := snapshotterBkt.Bucket([]byte(key))
                if entry == nil {
                        return fmt.Errorf("snapshot %v does not exist: %w", key, errdefs.ErrNotFound)
                }

                lbls, err := boltutil.ReadLabels(entry)
                stored.Labels = lbls
                if err != nil {
                        return fmt.Errorf("failed to read labels: %w", err)
                }
                if err := boltutil.ReadTimestamps(entry, &stored.Created, &stored.Updated); err != nil {
                        return fmt.Errorf("failed to read timestamps: %w", err)
                }

                backendKey = string(entry.Get(bucketKeyName))
                stored.Parent = string(entry.Get(bucketKeyParent))
                return nil
        }
        if err := view(ctx, s.db, load); err != nil {
                return snapshots.Info{}, err
        }

        backendInfo, err := s.Snapshotter.Stat(ctx, backendKey)
        if err != nil {
                return snapshots.Info{}, err
        }

        return overlayInfo(backendInfo, stored), nil
}

// Update changes the labels of the snapshot named info.Name, both in the
// metadata database and in the backend snapshotter.
//
// Accepted field paths are "labels" and "labels.<key>"; any other path
// yields an invalid-argument error. Without field paths the stored labels
// are replaced by info.Labels. An empty info.Name yields
// errdefs.ErrInvalidArgument, and an unknown snapshot yields a not-found
// error. The returned info is the backend result overlaid with the stored
// name, timestamps, parent and labels.
func (s *snapshotter) Update(ctx context.Context, info snapshots.Info, fieldpaths ...string) (snapshots.Info, error) {
        s.l.RLock()
        defer s.l.RUnlock()

        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return snapshots.Info{}, err
        }

        if info.Name == "" {
                return snapshots.Info{}, errdefs.ErrInvalidArgument
        }

        var (
                bkey  string
                local = snapshots.Info{
                        Name: info.Name,
                }
                updated bool
        )
        if err := update(ctx, s.db, func(tx *bolt.Tx) error {
                bkt := getSnapshotterBucket(tx, ns, s.name)
                if bkt == nil {
                        return fmt.Errorf("snapshot %v does not exist: %w", info.Name, errdefs.ErrNotFound)
                }
                sbkt := bkt.Bucket([]byte(info.Name))
                if sbkt == nil {
                        return fmt.Errorf("snapshot %v does not exist: %w", info.Name, errdefs.ErrNotFound)
                }

                local.Labels, err = boltutil.ReadLabels(sbkt)
                if err != nil {
                        return fmt.Errorf("failed to read labels: %w", err)
                }
                if err := boltutil.ReadTimestamps(sbkt, &local.Created, &local.Updated); err != nil {
                        return fmt.Errorf("failed to read timestamps: %w", err)
                }

                // Handle field updates
                if len(fieldpaths) > 0 {
                        for _, path := range fieldpaths {
                                if strings.HasPrefix(path, "labels.") {
                                        if local.Labels == nil {
                                                local.Labels = map[string]string{}
                                        }

                                        key := strings.TrimPrefix(path, "labels.")
                                        local.Labels[key] = info.Labels[key]
                                        continue
                                }

                                switch path {
                                case "labels":
                                        local.Labels = info.Labels
                                default:
                                        return fmt.Errorf("cannot update %q field on snapshot %q: %w", path, info.Name, errdefs.ErrInvalidArgument)
                                }
                        }
                } else {
                        local.Labels = info.Labels
                }
                if err := validateSnapshot(&local); err != nil {
                        return err
                }
                local.Updated = time.Now().UTC()

                // These are write failures; the messages previously said "read".
                if err := boltutil.WriteTimestamps(sbkt, local.Created, local.Updated); err != nil {
                        return fmt.Errorf("failed to write timestamps: %w", err)
                }
                if err := boltutil.WriteLabels(sbkt, local.Labels); err != nil {
                        return fmt.Errorf("failed to write labels: %w", err)
                }
                bkey = string(sbkt.Get(bucketKeyName))
                local.Parent = string(sbkt.Get(bucketKeyParent))

                inner := snapshots.Info{
                        Name:   bkey,
                        Labels: snapshots.FilterInheritedLabels(local.Labels),
                }

                // NOTE: Perform this inside the transaction to reduce the
                // chances of out of sync data. The backend snapshotters
                // should perform the Update as fast as possible.
                if info, err = s.Snapshotter.Update(ctx, inner, fieldpaths...); err != nil {
                        return err
                }
                updated = true

                return nil
        }); err != nil {
                // The backend was already updated but the transaction did not
                // go through, so the two may now disagree; log it.
                if updated {
                        log.G(ctx).WithField("snapshotter", s.name).WithField("key", local.Name).WithError(err).Error("transaction failed after updating snapshot backend")
                }
                return snapshots.Info{}, err
        }

        return overlayInfo(info, local), nil
}

// overlayInfo returns info with its name, timestamps and parent replaced by
// those of overlay. Labels from overlay are merged into info's labels and
// win on key conflicts; when info has no labels, overlay's map is used as is.
func overlayInfo(info, overlay snapshots.Info) snapshots.Info {
        info.Name, info.Parent = overlay.Name, overlay.Parent
        info.Created, info.Updated = overlay.Created, overlay.Updated

        if info.Labels == nil {
                info.Labels = overlay.Labels
                return info
        }

        for key, value := range overlay.Labels {
                info.Labels[key] = value
        }
        return info
}

// Usage resolves key to the name stored for it and returns the backend
// snapshotter's usage for that name.
func (s *snapshotter) Usage(ctx context.Context, key string) (snapshots.Usage, error) {
        backendKey, err := s.resolveKey(ctx, key)
        if err == nil {
                return s.Snapshotter.Usage(ctx, backendKey)
        }
        return snapshots.Usage{}, err
}

// Mounts resolves key to the name stored for it and returns the backend
// snapshotter's mounts for that name.
func (s *snapshotter) Mounts(ctx context.Context, key string) ([]mount.Mount, error) {
        backendKey, err := s.resolveKey(ctx, key)
        if err == nil {
                return s.Snapshotter.Mounts(ctx, backendKey)
        }
        return nil, err
}

// Prepare creates a writable snapshot for key on top of parent and, when the
// database provides a publisher, publishes a "/snapshot/prepare" event. A
// publish failure is returned as the error of the call.
func (s *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
        mounts, err := s.createSnapshot(ctx, key, parent, false, opts)
        if err != nil {
                return nil, err
        }

        publisher := s.db.Publisher(ctx)
        if publisher == nil {
                return mounts, nil
        }

        event := &eventstypes.SnapshotPrepare{
                Key:         key,
                Parent:      parent,
                Snapshotter: s.name,
        }
        if err := publisher.Publish(ctx, "/snapshot/prepare", event); err != nil {
                return nil, err
        }

        return mounts, nil
}

// View creates a read-only snapshot for key on top of parent.
func (s *snapshotter) View(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
        const readonly = true
        return s.createSnapshot(ctx, key, parent, readonly, opts)
}

// createSnapshot creates a snapshot for key on top of parent, through the
// backend's View when readonly is set and through its Prepare otherwise, and
// records it in the metadata database.
//
// The work is split into three steps:
//  1. A first transaction checks whether the target reference (taken from the
//     labelSnapshotRef label) or key already has a bucket. If so a lease is
//     added and an already-exists error is returned. Otherwise the parent's
//     backend name is resolved and a new backend key is allocated.
//  2. The backend snapshotter is called outside of any transaction. If it
//     reports already-exists while a target was given, the matching committed
//     backend snapshot is looked up and adopted.
//  3. A second transaction adds the lease and writes the snapshot bucket
//     (parent link, timestamps, labels, backend name).
//
// If the result cannot be stored, a snapshot newly created in the backend is
// removed again on a best-effort basis.
func (s *snapshotter) createSnapshot(ctx context.Context, key, parent string, readonly bool, opts []snapshots.Opt) ([]mount.Mount, error) {
        s.l.RLock()
        defer s.l.RUnlock()

        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }

        // Collect the caller's options into base before validating them.
        var base snapshots.Info
        for _, opt := range opts {
                if err := opt(&base); err != nil {
                        return nil, err
                }
        }

        if err := validateSnapshot(&base); err != nil {
                return nil, err
        }

        var (
                target  = base.Labels[labelSnapshotRef]
                bparent string
                bkey    string
                bopts   = []snapshots.Opt{
                        snapshots.WithLabels(snapshots.FilterInheritedLabels(base.Labels)),
                }
                // rerr holds an error that is returned only after the
                // surrounding transaction has been allowed to commit.
                rerr error
        )

        if err := update(ctx, s.db, func(tx *bolt.Tx) error {
                bkt, err := createSnapshotterBucket(tx, ns, s.name)
                if err != nil {
                        return err
                }

                // Check if target exists, if so, return already exists
                if target != "" {
                        if tbkt := bkt.Bucket([]byte(target)); tbkt != nil {
                                rerr = fmt.Errorf("target snapshot %q: %w", target, errdefs.ErrAlreadyExists)
                                if err := addSnapshotLease(ctx, tx, s.name, target); err != nil {
                                        return err
                                }
                                return nil
                        }
                }

                // Same check for the requested key itself.
                if bbkt := bkt.Bucket([]byte(key)); bbkt != nil {
                        rerr = fmt.Errorf("snapshot %q: %w", key, errdefs.ErrAlreadyExists)
                        if err := addSnapshotLease(ctx, tx, s.name, key); err != nil {
                                return err
                        }
                        return nil
                }

                // Translate the parent key into the name stored for it.
                if parent != "" {
                        pbkt := bkt.Bucket([]byte(parent))
                        if pbkt == nil {
                                return fmt.Errorf("parent snapshot %v does not exist: %w", parent, errdefs.ErrNotFound)
                        }
                        bparent = string(pbkt.Get(bucketKeyName))
                }

                // The bucket sequence number is part of the backend key.
                sid, err := bkt.NextSequence()
                if err != nil {
                        return err
                }
                bkey = createKey(sid, ns, key)

                return err
        }); err != nil {
                return nil, err
        }
        // Already exists and lease successfully added in transaction
        if rerr != nil {
                return nil, rerr
        }

        var (
                m []mount.Mount
                // created is set to the backend key only when the backend
                // call below succeeded; it drives the cleanup at the end.
                created string
        )
        if readonly {
                m, err = s.Snapshotter.View(ctx, bkey, bparent, bopts...)
        } else {
                m, err = s.Snapshotter.Prepare(ctx, bkey, bparent, bopts...)
        }

        // An already exists error should indicate the backend found a snapshot
        // matching a provided target reference.
        if errdefs.IsAlreadyExists(err) {
                if target != "" {
                        var tinfo *snapshots.Info
                        filter := fmt.Sprintf(`labels."containerd.io/snapshot.ref"==%s,parent==%q`, target, bparent)
                        if err := s.Snapshotter.Walk(ctx, func(ctx context.Context, i snapshots.Info) error {
                                // Keep the first committed snapshot that matches.
                                if tinfo == nil && i.Kind == snapshots.KindCommitted {
                                        if i.Labels["containerd.io/snapshot.ref"] != target {
                                                // Walk did not respect filter
                                                return nil
                                        }
                                        if i.Parent != bparent {
                                                // Walk did not respect filter
                                                return nil
                                        }
                                        tinfo = &i
                                }
                                return nil

                        }, filter); err != nil {
                                return nil, fmt.Errorf("failed walking backend snapshots: %w", err)
                        }

                        if tinfo == nil {
                                return nil, fmt.Errorf("target snapshot %q in backend: %w", target, errdefs.ErrNotFound)
                        }

                        // Adopt the existing backend snapshot: it is stored
                        // under the target key with the backend's own
                        // name, parent and timestamps.
                        key = target
                        bkey = tinfo.Name
                        bparent = tinfo.Parent
                        base.Created = tinfo.Created
                        base.Updated = tinfo.Updated
                        if base.Labels == nil {
                                base.Labels = tinfo.Labels
                        } else {
                                // Labels supplied by the caller take precedence
                                // over the backend's.
                                for k, v := range tinfo.Labels {
                                        if _, ok := base.Labels[k]; !ok {
                                                base.Labels[k] = v
                                        }
                                }
                        }

                        // Propagate this error after the final update
                        rerr = fmt.Errorf("target snapshot %q from snapshotter: %w", target, errdefs.ErrAlreadyExists)
                } else {
                        // This condition is unexpected as the key provided is expected
                        // to be new and unique, return as unknown response from backend
                        // to avoid confusing callers handling already exists.
                        return nil, fmt.Errorf("unexpected error from snapshotter: %v: %w", err, errdefs.ErrUnknown)
                }
        } else if err != nil {
                return nil, err
        } else {
                ts := time.Now().UTC()
                base.Created = ts
                base.Updated = ts
                created = bkey
        }

        if txerr := update(ctx, s.db, func(tx *bolt.Tx) error {
                bkt := getSnapshotterBucket(tx, ns, s.name)
                if bkt == nil {
                        return fmt.Errorf("can not find snapshotter %q: %w", s.name, errdefs.ErrNotFound)
                }

                if err := addSnapshotLease(ctx, tx, s.name, key); err != nil {
                        return err
                }

                bbkt, err := bkt.CreateBucket([]byte(key))
                if err != nil {
                        if err != bolt.ErrBucketExists {
                                return err
                        }
                        // The bucket appeared since the first transaction.
                        // Return nil so the lease added above is committed,
                        // and report already-exists through rerr.
                        if rerr == nil {
                                rerr = fmt.Errorf("snapshot %q: %w", key, errdefs.ErrAlreadyExists)
                        }
                        return nil
                }

                if parent != "" {
                        pbkt := bkt.Bucket([]byte(parent))
                        if pbkt == nil {
                                return fmt.Errorf("parent snapshot %v does not exist: %w", parent, errdefs.ErrNotFound)
                        }

                        // Ensure the backend's parent matches the metadata store's parent
                        // If it is mismatched, then a target was provided for a snapshotter
                        // which has a different parent than requested.
                        // NOTE: The backend snapshotter is responsible for enforcing the
                        // uniqueness of the reference relationships, the metadata store
                        // can only error out to prevent inconsistent data.
                        if bparent != string(pbkt.Get(bucketKeyName)) {
                                return fmt.Errorf("mismatched parent %s from target %s: %w", parent, target, errdefs.ErrInvalidArgument)
                        }

                        // Record the link in both directions: the child key in the
                        // parent's children bucket and the parent key on the child.
                        cbkt, err := pbkt.CreateBucketIfNotExists(bucketKeyChildren)
                        if err != nil {
                                return err
                        }
                        if err := cbkt.Put([]byte(key), nil); err != nil {
                                return err
                        }

                        if err := bbkt.Put(bucketKeyParent, []byte(parent)); err != nil {
                                return err
                        }
                }

                if err := boltutil.WriteTimestamps(bbkt, base.Created, base.Updated); err != nil {
                        return err
                }
                if err := boltutil.WriteLabels(bbkt, base.Labels); err != nil {
                        return err
                }

                return bbkt.Put(bucketKeyName, []byte(bkey))
        }); txerr != nil {
                rerr = txerr
        }

        if rerr != nil {
                // If the created reference is not stored, attempt clean up
                if created != "" {
                        if err := s.Snapshotter.Remove(ctx, created); err != nil {
                                log.G(ctx).WithField("snapshotter", s.name).WithField("key", created).WithError(err).Error("failed to cleanup unreferenced snapshot")
                        }
                }
                return nil, rerr
        }

        return m, nil
}

// Commit records the snapshot currently stored under key as a committed
// snapshot called name, within the namespace carried by ctx.
//
// All metadata changes happen inside one update transaction: a lease
// reference is added for name, a bucket for name is created, the parent link
// (if any) is moved from key to name, fresh timestamps and the labels supplied
// through opts are written, the bucket and lease for key are removed, and
// finally the backend snapshotter's Commit is invoked.
//
// If a bucket for name already exists the transaction still completes
// without error, and an error wrapping errdefs.ErrAlreadyExists is returned
// afterwards. On success a "/snapshot/commit" event is published when the
// database provides a publisher; a publish failure is returned to the caller.
func (s *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error {
        s.l.RLock()
        defer s.l.RUnlock()

        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        // Apply the options to a scratch Info; only its labels are used below.
        var base snapshots.Info
        for _, opt := range opts {
                if err := opt(&base); err != nil {
                        return err
                }
        }

        if err := validateSnapshot(&base); err != nil {
                return err
        }

        var (
                // bname is set once the backend commit has succeeded; it is only
                // used for logging when the transaction fails afterwards.
                bname string
                // rerr carries the "already exists" result out of the transaction
                // without making the transaction itself fail.
                rerr error
        )
        if err := update(ctx, s.db, func(tx *bolt.Tx) error {
                bkt := getSnapshotterBucket(tx, ns, s.name)
                if bkt == nil {
                        return fmt.Errorf("can not find snapshotter %q: %w",
                                s.name, errdefs.ErrNotFound)
                }

                if err := addSnapshotLease(ctx, tx, s.name, name); err != nil {
                        return err
                }
                bbkt, err := bkt.CreateBucket([]byte(name))
                if err != nil {
                        if err == bolt.ErrBucketExists {
                                // Returning nil lets the transaction complete; the error
                                // is reported to the caller through rerr.
                                rerr = fmt.Errorf("snapshot %q: %w", name, errdefs.ErrAlreadyExists)
                                return nil
                        }
                        return err
                }

                // obkt is the metadata bucket of the snapshot being committed.
                obkt := bkt.Bucket([]byte(key))
                if obkt == nil {
                        return fmt.Errorf("snapshot %v does not exist: %w", key, errdefs.ErrNotFound)
                }

                // bkey is the key under which the backend snapshotter knows the
                // snapshot being committed.
                bkey := string(obkt.Get(bucketKeyName))

                sid, err := bkt.NextSequence()
                if err != nil {
                        return err
                }

                // nameKey is the backend name given to the committed snapshot.
                nameKey := createKey(sid, ns, name)

                if err := bbkt.Put(bucketKeyName, []byte(nameKey)); err != nil {
                        return err
                }

                parent := obkt.Get(bucketKeyParent)
                if len(parent) > 0 {
                        pbkt := bkt.Bucket(parent)
                        if pbkt == nil {
                                return fmt.Errorf("parent snapshot %v does not exist: %w", string(parent), errdefs.ErrNotFound)
                        }

                        // Replace key with name in the parent's children bucket and
                        // record the parent on the new bucket.
                        cbkt, err := pbkt.CreateBucketIfNotExists(bucketKeyChildren)
                        if err != nil {
                                return err
                        }
                        if err := cbkt.Delete([]byte(key)); err != nil {
                                return err
                        }
                        if err := cbkt.Put([]byte(name), nil); err != nil {
                                return err
                        }

                        if err := bbkt.Put(bucketKeyParent, parent); err != nil {
                                return err
                        }
                }
                // The same timestamp is written as both created and updated time.
                ts := time.Now().UTC()
                if err := boltutil.WriteTimestamps(bbkt, ts, ts); err != nil {
                        return err
                }
                if err := boltutil.WriteLabels(bbkt, base.Labels); err != nil {
                        return err
                }

                // Drop the metadata bucket and lease reference of the old key.
                if err := bkt.DeleteBucket([]byte(key)); err != nil {
                        return err
                }
                if err := removeSnapshotLease(ctx, tx, s.name, key); err != nil {
                        return err
                }

                // Only the labels selected by FilterInheritedLabels are passed on
                // to the backend.
                inheritedOpt := snapshots.WithLabels(snapshots.FilterInheritedLabels(base.Labels))

                // NOTE: Backend snapshotters should commit fast and reliably to
                // prevent metadata store locking and to minimize rollbacks.
                // This operation should be done in the transaction to minimize the
                // risk of the committed keys becoming out of sync. If this operation
                // succeeds and the overall transaction fails then the risk of out of
                // sync data is higher and may require manual cleanup.
                if err := s.Snapshotter.Commit(ctx, nameKey, bkey, inheritedOpt); err != nil {
                        if errdefs.IsNotFound(err) {
                                log.G(ctx).WithField("snapshotter", s.name).WithField("key", key).WithError(err).Error("uncommittable snapshot: missing in backend, snapshot should be removed")
                        }
                        // NOTE: Consider handling already exists here from the backend. Currently
                        // already exists from the backend may be confusing to the client since it
                        // may require the client to re-attempt from prepare. However, if handling
                        // here it is not clear what happened with the existing backend key and
                        // whether the already prepared snapshot would still be used or must be
                        // discarded. It is best that all implementations of the snapshotter
                        // interface behave the same, in which case the backend should handle the
                        // mapping of duplicates and not error.
                        return err
                }
                bname = nameKey

                return nil
        }); err != nil {
                // A non-empty bname means the backend commit went through but the
                // metadata transaction did not.
                if bname != "" {
                        log.G(ctx).WithField("snapshotter", s.name).WithField("key", key).WithField("bname", bname).WithError(err).Error("uncommittable snapshot: transaction failed after commit, snapshot should be removed")

                }
                return err
        }

        // Publish the commit event only when the snapshot was actually committed.
        if rerr == nil {
                if publisher := s.db.Publisher(ctx); publisher != nil {
                        if err := publisher.Publish(ctx, "/snapshot/commit", &eventstypes.SnapshotCommit{
                                Key:         key,
                                Name:        name,
                                Snapshotter: s.name,
                        }); err != nil {
                                return err
                        }
                }
        }

        return rerr

}

// Remove deletes the metadata for the snapshot stored under key in the
// namespace carried by ctx.
//
// The removal is refused with errdefs.ErrFailedPrecondition when the snapshot
// still has entries in its children bucket, and fails with
// errdefs.ErrNotFound when the snapshot (or its recorded parent) has no
// bucket. On success the snapshotter is flagged dirty for garbage collection
// and, if the database has a publisher, a "/snapshot/remove" event is
// published.
func (s *snapshotter) Remove(ctx context.Context, key string) error {
        s.l.RLock()
        defer s.l.RUnlock()

        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        removeMetadata := func(tx *bolt.Tx) error {
                // Resolve the snapshot bucket; a missing snapshotter bucket is
                // reported the same way as a missing snapshot.
                root := getSnapshotterBucket(tx, namespace, s.name)
                var target *bolt.Bucket
                if root != nil {
                        target = root.Bucket([]byte(key))
                }
                if target == nil {
                        return fmt.Errorf("snapshot %v does not exist: %w", key, errdefs.ErrNotFound)
                }

                // Any entry in the children bucket blocks the removal.
                if children := target.Bucket(bucketKeyChildren); children != nil {
                        if first, _ := children.Cursor().First(); first != nil {
                                return fmt.Errorf("cannot remove snapshot with child: %w", errdefs.ErrFailedPrecondition)
                        }
                }

                // Unlink this snapshot from its parent's children bucket.
                if parentName := target.Get(bucketKeyParent); len(parentName) > 0 {
                        parentBkt := root.Bucket(parentName)
                        if parentBkt == nil {
                                return fmt.Errorf("parent snapshot %v does not exist: %w", string(parentName), errdefs.ErrNotFound)
                        }
                        if siblings := parentBkt.Bucket(bucketKeyChildren); siblings != nil {
                                if err := siblings.Delete([]byte(key)); err != nil {
                                        return fmt.Errorf("failed to remove child link: %w", err)
                                }
                        }
                }

                if err := root.DeleteBucket([]byte(key)); err != nil {
                        return err
                }
                if err := removeSnapshotLease(ctx, tx, s.name, key); err != nil {
                        return err
                }

                // Mark snapshotter as dirty for triggering garbage collection
                atomic.AddUint32(&s.db.dirty, 1)
                s.db.dirtySS[s.name] = struct{}{}

                return nil
        }

        if err := update(ctx, s.db, removeMetadata); err != nil {
                return err
        }

        publisher := s.db.Publisher(ctx)
        if publisher == nil {
                return nil
        }
        return publisher.Publish(ctx, "/snapshot/remove", &eventstypes.SnapshotRemove{
                Key:         key,
                Snapshotter: s.name,
        })
}

// infoPair couples the metadata-store view of a snapshot with the key used
// to look that snapshot up in the backend snapshotter.
type infoPair struct {
        // bkey is the value stored under bucketKeyName; Walk passes it to the
        // backend snapshotter's Stat.
        bkey string
        // info holds the name, parent, timestamps and labels read from the
        // snapshot's metadata bucket.
        info snapshots.Info
}

// Walk invokes fn for each snapshot recorded for this snapshotter in the
// namespace carried by ctx whose info matches the filters fs.
//
// Metadata is read in batches of up to 100 snapshots, each batch inside its
// own view transaction; the backend is then queried outside the transaction.
// Snapshots that the backend reports as not found are skipped. The first
// error returned by fn, the backend or the metadata store stops the walk.
func (s *snapshotter) Walk(ctx context.Context, fn snapshots.WalkFunc, fs ...string) error {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        filter, err := filters.ParseAll(fs...)
        if err != nil {
                return err
        }

        const batchSize = 100
        var (
                batch = []infoPair{}
                // resume is the key at which the next batch starts; it is empty
                // before the first batch and once the cursor has been exhausted.
                resume string
        )

        // readBatch fills batch with up to batchSize snapshots starting at resume.
        readBatch := func(tx *bolt.Tx) error {
                bkt := getSnapshotterBucket(tx, ns, s.name)
                if bkt == nil {
                        return nil
                }

                cur := bkt.Cursor()

                var k, v []byte
                if resume != "" {
                        k, v = cur.Seek([]byte(resume))
                } else {
                        k, v = cur.First()
                }

                for ; k != nil; k, v = cur.Next() {
                        // Only entries without a value are read as snapshot buckets.
                        if v != nil {
                                continue
                        }
                        if len(batch) >= batchSize {
                                // Leave k on the first unread snapshot so the next
                                // batch resumes from it.
                                break
                        }

                        sbkt := bkt.Bucket(k)

                        entry := infoPair{bkey: string(sbkt.Get(bucketKeyName))}
                        entry.info.Name = string(k)
                        entry.info.Parent = string(sbkt.Get(bucketKeyParent))

                        if err := boltutil.ReadTimestamps(sbkt, &entry.info.Created, &entry.info.Updated); err != nil {
                                return err
                        }
                        lbls, err := boltutil.ReadLabels(sbkt)
                        if err != nil {
                                return err
                        }
                        entry.info.Labels = lbls

                        batch = append(batch, entry)
                }

                resume = string(k)

                return nil
        }

        for {
                if err := view(ctx, s.db, readBatch); err != nil {
                        return err
                }

                for i := range batch {
                        info, err := s.Snapshotter.Stat(ctx, batch[i].bkey)
                        if err != nil {
                                if errdefs.IsNotFound(err) {
                                        continue
                                }
                                return err
                        }

                        info = overlayInfo(info, batch[i].info)
                        if !filter.Match(adaptSnapshot(info)) {
                                continue
                        }
                        if err := fn(ctx, info); err != nil {
                                return err
                        }
                }

                if resume == "" {
                        return nil
                }

                batch = batch[:0]
        }
}

// validateSnapshot checks every label on info with labels.Validate and
// returns the first failure, wrapped with an "info.Labels" prefix.
func validateSnapshot(info *snapshots.Info) error {
        for key, value := range info.Labels {
                err := labels.Validate(key, value)
                if err == nil {
                        continue
                }
                return fmt.Errorf("info.Labels: %w", err)
        }

        return nil
}

// garbageCollect removes all snapshots that are no longer used.
//
// It holds the snapshotter's write lock while it collects, from every
// namespace, the backend keys still recorded in the metadata store for this
// snapshotter, walks the backend to build the snapshot tree and prunes the
// snapshots whose names were not collected.
//
// After the lock is released, and only when no error occurred, the backend's
// Cleanup is called if it implements snapshots.Cleaner; a "not implemented"
// result from Cleanup is treated as success. The returned duration is
// measured from the moment the lock was acquired and is only set when err is
// nil.
func (s *snapshotter) garbageCollect(ctx context.Context) (d time.Duration, err error) {
        s.l.Lock()
        t1 := time.Now()
        defer func() {
                s.l.Unlock()
                // Cleanup runs outside the lock and only after a successful prune.
                if err == nil {
                        if c, ok := s.Snapshotter.(snapshots.Cleaner); ok {
                                err = c.Cleanup(ctx)
                                if errdefs.IsNotImplemented(err) {
                                        err = nil
                                }
                        }
                }
                if err == nil {
                        d = time.Since(t1)
                }
        }()

        // seen is the set of backend keys still referenced by the metadata store.
        seen := map[string]struct{}{}
        if err := s.db.View(func(tx *bolt.Tx) error {
                v1bkt := tx.Bucket(bucketKeyVersion)
                if v1bkt == nil {
                        return nil
                }

                // iterate through each namespace
                v1c := v1bkt.Cursor()

                for k, v := v1c.First(); k != nil; k, v = v1c.Next() {
                        // Entries with a value are not nested buckets; skip them.
                        if v != nil {
                                continue
                        }

                        sbkt := v1bkt.Bucket(k).Bucket(bucketKeyObjectSnapshots)
                        if sbkt == nil {
                                continue
                        }

                        // Load specific snapshotter
                        ssbkt := sbkt.Bucket([]byte(s.name))
                        if ssbkt == nil {
                                continue
                        }

                        // Record the backend key stored in each snapshot bucket.
                        if err := ssbkt.ForEach(func(sk, sv []byte) error {
                                if sv == nil {
                                        bkey := ssbkt.Bucket(sk).Get(bucketKeyName)
                                        if len(bkey) > 0 {
                                                seen[string(bkey)] = struct{}{}
                                        }
                                }
                                return nil
                        }); err != nil {
                                return err
                        }
                }

                return nil
        }); err != nil {
                return 0, err
        }

        roots, err := s.walkTree(ctx, seen)
        if err != nil {
                return 0, err
        }

        // TODO: Unlock before removal (once nodes are fully unavailable).
        // This could be achieved through doing prune inside the lock
        // and having a cleanup method which actually performs the
        // deletions on the snapshotters which support it.

        for _, node := range roots {
                if err := s.pruneBranch(ctx, node); err != nil {
                        return 0, err
                }
        }

        // The named results are finalized by the deferred function above.
        return
}

// treeNode is one snapshot in the parent/child tree built by walkTree.
type treeNode struct {
        // info is the snapshot info reported by the backend walk.
        info     snapshots.Info
        // remove is true when the snapshot's name was absent from the seen set
        // given to walkTree; pruneBranch removes such nodes from the backend.
        remove   bool
        // children are the nodes whose info.Parent names this node.
        children []*treeNode
}

// walkTree walks the backend snapshotter and arranges every reported snapshot
// into a forest keyed by parent name. A node is flagged for removal when its
// name is not present in seen. The returned slice holds the nodes that have
// no parent.
func (s *snapshotter) walkTree(ctx context.Context, seen map[string]struct{}) ([]*treeNode, error) {
        var (
                roots = []*treeNode{}
                index = map[string]*treeNode{}
        )

        // lookup returns the node registered under name, registering an empty
        // one first if the name has not been encountered yet.
        lookup := func(name string) *treeNode {
                if existing, ok := index[name]; ok {
                        return existing
                }
                created := &treeNode{}
                index[name] = created
                return created
        }

        visit := func(ctx context.Context, info snapshots.Info) error {
                current := lookup(info.Name)

                _, referenced := seen[info.Name]
                current.remove = !referenced
                current.info = info

                if info.Parent == "" {
                        roots = append(roots, current)
                        return nil
                }

                owner := lookup(info.Parent)
                owner.children = append(owner.children, current)
                return nil
        }

        if err := s.Snapshotter.Walk(ctx, visit); err != nil {
                return nil, err
        }

        return roots, nil
}

// pruneBranch removes the snapshots flagged for removal in the subtree rooted
// at node, visiting children before their parent. A failed-precondition error
// from the backend is logged as a warning and otherwise ignored; any other
// backend error aborts the prune and is returned.
func (s *snapshotter) pruneBranch(ctx context.Context, node *treeNode) error {
        for i := range node.children {
                if err := s.pruneBranch(ctx, node.children[i]); err != nil {
                        return err
                }
        }

        if !node.remove {
                return nil
        }

        logger := log.G(ctx).WithField("snapshotter", s.name)
        err := s.Snapshotter.Remove(ctx, node.info.Name)
        switch {
        case err == nil:
                logger.WithField("key", node.info.Name).Debug("removed snapshot")
        case errdefs.IsFailedPrecondition(err):
                logger.WithError(err).WithField("key", node.info.Name).Warnf("failed to remove snapshot")
        default:
                return err
        }

        return nil
}

// Close closes the wrapped backend snapshotter and returns its error; the
// metadata database is not closed.
func (s *snapshotter) Close() error {
        backend := s.Snapshotter
        return backend.Close()
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package cgroups

import (
        "context"

        "github.com/containerd/cgroups/v3"
        "github.com/containerd/containerd/v2/core/events"
        v1 "github.com/containerd/containerd/v2/core/metrics/cgroups/v1"
        v2 "github.com/containerd/containerd/v2/core/metrics/cgroups/v2"
        "github.com/containerd/containerd/v2/core/runtime"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/version"
        "github.com/containerd/platforms"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        metrics "github.com/docker/go-metrics"
)

// Config for the cgroups monitor
type Config struct {
        // NoPrometheus, when true, makes New skip creating and registering the
        // "container" metrics namespace. It is read from the "no_prometheus"
        // TOML key.
        NoPrometheus bool `toml:"no_prometheus"`
}

func init() {
        registry.Register(&plugin.Registration{
                Type:   plugins.TaskMonitorPlugin,
                ID:     "cgroups",
                InitFn: New,
                Requires: []plugin.Type{
                        plugins.EventPlugin,
                },
                Config: &Config{},
                ConfigMigration: func(ctx context.Context, configVersion int, pluginConfigs map[string]interface{}) error {
                        if configVersion >= version.ConfigVersion {
                                return nil
                        }
                        // Previous plugin name
                        const pluginName = "io.containerd.monitor.v1.cgroups"
                        c, ok := pluginConfigs[pluginName]
                        if ok {
                                pluginConfigs[string(plugins.TaskMonitorPlugin)+".cgroups"] = c
                                delete(pluginConfigs, pluginName)
                        }

                        return nil
                },
        })
}

// New returns a new cgroups monitor.
//
// Unless Config.NoPrometheus is set, a "container" metrics namespace is
// created and, once the monitor has been built successfully, registered. The
// v2 task monitor is used when cgroups run in unified mode and the v1 monitor
// otherwise. The default platform spec is appended to the plugin metadata.
func New(ic *plugin.InitContext) (interface{}, error) {
        var ns *metrics.Namespace
        if cfg := ic.Config.(*Config); !cfg.NoPrometheus {
                ns = metrics.NewNamespace("container", "", nil)
        }

        ep, err := ic.GetSingle(plugins.EventPlugin)
        if err != nil {
                return nil, err
        }

        var tm runtime.TaskMonitor
        switch cgroups.Mode() {
        case cgroups.Unified:
                tm, err = v2.NewTaskMonitor(ic.Context, ep.(events.Publisher), ns)
        default:
                tm, err = v1.NewTaskMonitor(ic.Context, ep.(events.Publisher), ns)
        }
        if err != nil {
                return nil, err
        }

        // Only expose the namespace once the monitor exists.
        if ns != nil {
                metrics.Register(ns)
        }
        ic.Meta.Platforms = append(ic.Meta.Platforms, platforms.DefaultSpec())
        return tm, nil
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v1

import (
        "strconv"

        v1 "github.com/containerd/containerd/v2/core/metrics/types/v1"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// blkioMetrics describes the blkio metrics. Every entry is reported as a
// gauge labelled with "op", "device", "major" and "minor", and produces no
// values when the stats carry no Blkio section; otherwise the matching entry
// list is converted with blkioValues.
var blkioMetrics = []*metric{
        {
                name:   "blkio_io_merged_recursive",
                help:   "The blkio io merged recursive",
                unit:   metrics.Total,
                vt:     prometheus.GaugeValue,
                labels: []string{"op", "device", "major", "minor"},
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Blkio == nil {
                                return nil
                        }
                        return blkioValues(stats.Blkio.IoMergedRecursive)
                },
        },
        {
                name:   "blkio_io_queued_recursive",
                help:   "The blkio io queued recursive",
                unit:   metrics.Total,
                vt:     prometheus.GaugeValue,
                labels: []string{"op", "device", "major", "minor"},
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Blkio == nil {
                                return nil
                        }
                        return blkioValues(stats.Blkio.IoQueuedRecursive)
                },
        },
        {
                // The only blkio metric in this table that uses the Bytes unit.
                name:   "blkio_io_service_bytes_recursive",
                help:   "The blkio io service bytes recursive",
                unit:   metrics.Bytes,
                vt:     prometheus.GaugeValue,
                labels: []string{"op", "device", "major", "minor"},
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Blkio == nil {
                                return nil
                        }
                        return blkioValues(stats.Blkio.IoServiceBytesRecursive)
                },
        },
        {
                name:   "blkio_io_service_time_recursive",
                help:   "The blkio io service time recursive",
                unit:   metrics.Total,
                vt:     prometheus.GaugeValue,
                labels: []string{"op", "device", "major", "minor"},
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Blkio == nil {
                                return nil
                        }
                        return blkioValues(stats.Blkio.IoServiceTimeRecursive)
                },
        },
        {
                name:   "blkio_io_serviced_recursive",
                help:   "The blkio io serviced recursive",
                unit:   metrics.Total,
                vt:     prometheus.GaugeValue,
                labels: []string{"op", "device", "major", "minor"},
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Blkio == nil {
                                return nil
                        }
                        return blkioValues(stats.Blkio.IoServicedRecursive)
                },
        },
        {
                name:   "blkio_io_time_recursive",
                help:   "The blkio io time recursive",
                unit:   metrics.Total,
                vt:     prometheus.GaugeValue,
                labels: []string{"op", "device", "major", "minor"},
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Blkio == nil {
                                return nil
                        }
                        return blkioValues(stats.Blkio.IoTimeRecursive)
                },
        },
        {
                name:   "blkio_sectors_recursive",
                help:   "The blkio sectors recursive",
                unit:   metrics.Total,
                vt:     prometheus.GaugeValue,
                labels: []string{"op", "device", "major", "minor"},
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Blkio == nil {
                                return nil
                        }
                        return blkioValues(stats.Blkio.SectorsRecursive)
                },
        },
}

// blkioValues converts blkio entries into metric values. Each value carries
// the entry's Value as a float64 and the label values op, device, major and
// minor, in the order declared by the blkio metrics. An empty or nil input
// yields a nil slice.
func blkioValues(l []*v1.BlkIOEntry) []value {
        if len(l) == 0 {
                // Preserve the nil result for "nothing to report".
                return nil
        }

        // The result has exactly one value per entry, so size it up front
        // instead of growing it on every append.
        out := make([]value, 0, len(l))
        for _, e := range l {
                out = append(out, value{
                        v: float64(e.Value),
                        l: []string{e.Op, e.Device, strconv.FormatUint(e.Major, 10), strconv.FormatUint(e.Minor, 10)},
                })
        }
        return out
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v1

import (
        "context"

        cgroups "github.com/containerd/cgroups/v3/cgroup1"
        eventstypes "github.com/containerd/containerd/api/events"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/runtime"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/docker/go-metrics"
)

// NewTaskMonitor returns a new cgroups monitor that collects metrics into ns
// and publishes OOM events through publisher using ctx as the base context.
// It fails when the OOM collector cannot be created.
func NewTaskMonitor(ctx context.Context, publisher events.Publisher, ns *metrics.Namespace) (runtime.TaskMonitor, error) {
        monitor := &cgroupsMonitor{
                collector: NewCollector(ns),
                context:   ctx,
                publisher: publisher,
        }

        oom, err := newOOMCollector(ns)
        if err != nil {
                return nil, err
        }
        monitor.oom = oom

        return monitor, nil
}

// cgroupsMonitor is the runtime.TaskMonitor returned by NewTaskMonitor.
type cgroupsMonitor struct {
        // collector receives tasks in Monitor and drops them in Stop.
        collector *Collector
        // oom receives each monitored task's cgroup together with the trigger
        // callback.
        oom       *oomCollector
        // context is the base context used to derive the namespaced context
        // for published events and to obtain the logger in trigger.
        context   context.Context
        // publisher is used by trigger to publish TaskOOM events.
        publisher events.Publisher
}

// cgroupTask is the optional interface Monitor looks for on a runtime.Task in
// order to obtain the task's cgroup for OOM monitoring.
type cgroupTask interface {
        // Cgroup returns the task's cgroup or an error.
        Cgroup() (cgroups.Cgroup, error)
}

// Monitor adds the task to the metrics collector and, when the task exposes a
// cgroup, registers that cgroup with the OOM collector.
//
// Tasks that do not implement cgroupTask, or whose cgroup lookup reports
// not-found, are only added to the metrics collector. A
// cgroups.ErrMemoryNotSupported result from the OOM collector is logged as a
// warning and not returned.
func (m *cgroupsMonitor) Monitor(c runtime.Task, labels map[string]string) error {
        if err := m.collector.Add(c, labels); err != nil {
                return err
        }

        task, hasCgroup := c.(cgroupTask)
        if !hasCgroup {
                return nil
        }

        cg, err := task.Cgroup()
        switch {
        case err == nil:
                // continue with OOM registration below
        case errdefs.IsNotFound(err):
                return nil
        default:
                return err
        }

        if err := m.oom.Add(c.ID(), c.Namespace(), cg, m.trigger); err != nil {
                if err == cgroups.ErrMemoryNotSupported {
                        log.L.WithError(err).Warn("OOM monitoring failed")
                        return nil
                }
                return err
        }
        return nil
}

// Stop removes the task from the metrics collector. It always returns nil.
func (m *cgroupsMonitor) Stop(c runtime.Task) error {
        collector := m.collector
        collector.Remove(c)
        return nil
}

// trigger publishes a TaskOOM event for container id under the given
// namespace. A publish failure is logged and not propagated. The cg argument
// is not used.
func (m *cgroupsMonitor) trigger(id, namespace string, cg cgroups.Cgroup) {
        event := &eventstypes.TaskOOM{
                ContainerID: id,
        }
        nsCtx := namespaces.WithNamespace(m.context, namespace)

        err := m.publisher.Publish(nsCtx, runtime.TaskOOMEventTopic, event)
        if err == nil {
                return
        }
        log.G(m.context).WithError(err).Error("post OOM event")
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v1

import (
        "strconv"

        v1 "github.com/containerd/containerd/v2/core/metrics/types/v1"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// cpuMetrics lists the CPU metrics extracted from a *v1.Metrics sample.
//
// Every getValues closure returns nil when the data it needs is absent. That
// covers both a missing CPU message and a missing nested Usage / Throttling
// message, so a partially populated sample is skipped instead of causing a
// nil pointer dereference.
var cpuMetrics = []*metric{
        {
                name: "cpu_total",
                help: "The total cpu time",
                unit: metrics.Nanoseconds,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.CPU == nil || stats.CPU.Usage == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.CPU.Usage.Total),
                                },
                        }
                },
        },
        {
                name: "cpu_kernel",
                help: "The total kernel cpu time",
                unit: metrics.Nanoseconds,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.CPU == nil || stats.CPU.Usage == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.CPU.Usage.Kernel),
                                },
                        }
                },
        },
        {
                name: "cpu_user",
                help: "The total user cpu time",
                unit: metrics.Nanoseconds,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.CPU == nil || stats.CPU.Usage == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.CPU.Usage.User),
                                },
                        }
                },
        },
        {
                name:   "per_cpu",
                help:   "The total cpu time per cpu",
                unit:   metrics.Nanoseconds,
                vt:     prometheus.GaugeValue,
                labels: []string{"cpu"},
                getValues: func(stats *v1.Metrics) []value {
                        if stats.CPU == nil || stats.CPU.Usage == nil {
                                return nil
                        }
                        // One value per entry, labelled with the entry's index in PerCPU.
                        var out []value
                        for i, v := range stats.CPU.Usage.PerCPU {
                                out = append(out, value{
                                        v: float64(v),
                                        l: []string{strconv.Itoa(i)},
                                })
                        }
                        return out
                },
        },
        {
                name: "cpu_throttle_periods",
                help: "The total cpu throttle periods",
                unit: metrics.Total,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.CPU == nil || stats.CPU.Throttling == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.CPU.Throttling.Periods),
                                },
                        }
                },
        },
        {
                name: "cpu_throttled_periods",
                help: "The total cpu throttled periods",
                unit: metrics.Total,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.CPU == nil || stats.CPU.Throttling == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.CPU.Throttling.ThrottledPeriods),
                                },
                        }
                },
        },
        {
                name: "cpu_throttled_time",
                help: "The total cpu throttled time",
                unit: metrics.Nanoseconds,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.CPU == nil || stats.CPU.Throttling == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.CPU.Throttling.ThrottledTime),
                                },
                        }
                },
        },
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v1

import (
        v1 "github.com/containerd/containerd/v2/core/metrics/types/v1"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// hugetlbMetrics lists the hugetlb metrics extracted from a *v1.Metrics
// sample. Each getValues closure emits one value per entry of stats.Hugetlb,
// labelled with that entry's Pagesize, and returns nil when there are no
// entries.
var hugetlbMetrics = []*metric{
        {
                name:   "hugetlb_usage",
                help:   "The hugetlb usage",
                unit:   metrics.Bytes,
                vt:     prometheus.GaugeValue,
                labels: []string{"page"},
                getValues: func(stats *v1.Metrics) []value {
                        if len(stats.Hugetlb) == 0 {
                                return nil
                        }
                        var vals []value
                        for i := range stats.Hugetlb {
                                page := stats.Hugetlb[i]
                                vals = append(vals, value{
                                        l: []string{page.Pagesize},
                                        v: float64(page.Usage),
                                })
                        }
                        return vals
                },
        },
        {
                name:   "hugetlb_failcnt",
                help:   "The hugetlb failcnt",
                unit:   metrics.Total,
                vt:     prometheus.GaugeValue,
                labels: []string{"page"},
                getValues: func(stats *v1.Metrics) []value {
                        if len(stats.Hugetlb) == 0 {
                                return nil
                        }
                        var vals []value
                        for i := range stats.Hugetlb {
                                page := stats.Hugetlb[i]
                                vals = append(vals, value{
                                        l: []string{page.Pagesize},
                                        v: float64(page.Failcnt),
                                })
                        }
                        return vals
                },
        },
        {
                name:   "hugetlb_max",
                help:   "The hugetlb maximum usage",
                unit:   metrics.Bytes,
                vt:     prometheus.GaugeValue,
                labels: []string{"page"},
                getValues: func(stats *v1.Metrics) []value {
                        if len(stats.Hugetlb) == 0 {
                                return nil
                        }
                        var vals []value
                        for i := range stats.Hugetlb {
                                page := stats.Hugetlb[i]
                                vals = append(vals, value{
                                        l: []string{page.Pagesize},
                                        v: float64(page.Max),
                                })
                        }
                        return vals
                },
        },
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v1

import (
        v1 "github.com/containerd/containerd/v2/core/metrics/types/v1"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

var memoryMetrics = []*metric{
        {
                name: "memory_cache",
                help: "The cache amount used",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Cache),
                                },
                        }
                },
        },
        {
                name: "memory_rss",
                help: "The rss amount used",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.RSS),
                                },
                        }
                },
        },
        {
                name: "memory_rss_huge",
                help: "The rss_huge amount used",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.RSSHuge),
                                },
                        }
                },
        },
        {
                name: "memory_mapped_file",
                help: "The mapped_file amount used",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.MappedFile),
                                },
                        }
                },
        },
        {
                name: "memory_dirty",
                help: "The dirty amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Dirty),
                                },
                        }
                },
        },
        {
                name: "memory_writeback",
                help: "The writeback amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Writeback),
                                },
                        }
                },
        },
        {
                name: "memory_pgpgin",
                help: "The pgpgin amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.PgPgIn),
                                },
                        }
                },
        },
        {
                name: "memory_pgpgout",
                help: "The pgpgout amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.PgPgOut),
                                },
                        }
                },
        },
        {
                name: "memory_pgfault",
                help: "The pgfault amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.PgFault),
                                },
                        }
                },
        },
        {
                name: "memory_pgmajfault",
                help: "The pgmajfault amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.PgMajFault),
                                },
                        }
                },
        },
        {
                name: "memory_inactive_anon",
                help: "The inactive_anon amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.InactiveAnon),
                                },
                        }
                },
        },
        {
                name: "memory_active_anon",
                help: "The active_anon amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.ActiveAnon),
                                },
                        }
                },
        },
        {
                name: "memory_inactive_file",
                help: "The inactive_file amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.InactiveFile),
                                },
                        }
                },
        },
        {
                name: "memory_active_file",
                help: "The active_file amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.ActiveFile),
                                },
                        }
                },
        },
        {
                name: "memory_unevictable",
                help: "The unevictable amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Unevictable),
                                },
                        }
                },
        },
        {
                name: "memory_hierarchical_memory_limit",
                help: "The hierarchical_memory_limit amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.HierarchicalMemoryLimit),
                                },
                        }
                },
        },
        {
                name: "memory_hierarchical_memsw_limit",
                help: "The hierarchical_memsw_limit amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.HierarchicalSwapLimit),
                                },
                        }
                },
        },
        {
                name: "memory_total_cache",
                help: "The total_cache amount used",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalCache),
                                },
                        }
                },
        },
        {
                name: "memory_total_rss",
                help: "The total_rss amount used",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalRSS),
                                },
                        }
                },
        },
        {
                name: "memory_total_rss_huge",
                help: "The total_rss_huge amount used",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalRSSHuge),
                                },
                        }
                },
        },
        {
                name: "memory_total_mapped_file",
                help: "The total_mapped_file amount used",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalMappedFile),
                                },
                        }
                },
        },
        {
                name: "memory_total_dirty",
                help: "The total_dirty amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalDirty),
                                },
                        }
                },
        },
        {
                name: "memory_total_writeback",
                help: "The total_writeback amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalWriteback),
                                },
                        }
                },
        },
        {
                name: "memory_total_pgpgin",
                help: "The total_pgpgin amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalPgPgIn),
                                },
                        }
                },
        },
        {
                name: "memory_total_pgpgout",
                help: "The total_pgpgout amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalPgPgOut),
                                },
                        }
                },
        },
        {
                name: "memory_total_pgfault",
                help: "The total_pgfault amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalPgFault),
                                },
                        }
                },
        },
        {
                name: "memory_total_pgmajfault",
                help: "The total_pgmajfault amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalPgMajFault),
                                },
                        }
                },
        },
        {
                name: "memory_total_inactive_anon",
                help: "The total_inactive_anon amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalInactiveAnon),
                                },
                        }
                },
        },
        {
                name: "memory_total_active_anon",
                help: "The total_active_anon amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalActiveAnon),
                                },
                        }
                },
        },
        {
                name: "memory_total_inactive_file",
                help: "The total_inactive_file amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalInactiveFile),
                                },
                        }
                },
        },
        {
                name: "memory_total_active_file",
                help: "The total_active_file amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalActiveFile),
                                },
                        }
                },
        },
        {
                name: "memory_total_unevictable",
                help: "The total_unevictable amount",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.TotalUnevictable),
                                },
                        }
                },
        },
        {
                name: "memory_usage_failcnt",
                help: "The usage failcnt",
                unit: metrics.Total,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetUsage() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Usage.Failcnt),
                                },
                        }
                },
        },
        {
                name: "memory_usage_limit",
                help: "The memory limit",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetUsage() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Usage.Limit),
                                },
                        }
                },
        },
        {
                name: "memory_usage_max",
                help: "The memory maximum usage",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetUsage() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Usage.Max),
                                },
                        }
                },
        },
        {
                name: "memory_usage_usage",
                help: "The memory usage",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetUsage() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Usage.Usage),
                                },
                        }
                },
        },
        {
                name: "memory_swap_failcnt",
                help: "The swap failcnt",
                unit: metrics.Total,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetSwap() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Swap.Failcnt),
                                },
                        }
                },
        },
        {
                name: "memory_swap_limit",
                help: "The swap limit",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetSwap() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Swap.Limit),
                                },
                        }
                },
        },
        {
                name: "memory_swap_max",
                help: "The swap maximum usage",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetSwap() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Swap.Max),
                                },
                        }
                },
        },
        {
                name: "memory_swap_usage",
                help: "The swap usage",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetSwap() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Swap.Usage),
                                },
                        }
                },
        },
        {
                name: "memory_kernel_failcnt",
                help: "The kernel failcnt",
                unit: metrics.Total,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetKernel() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Kernel.Failcnt),
                                },
                        }
                },
        },
        {
                name: "memory_kernel_limit",
                help: "The kernel limit",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetKernel() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Kernel.Limit),
                                },
                        }
                },
        },
        {
                name: "memory_kernel_max",
                help: "The kernel maximum usage",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetKernel() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Kernel.Max),
                                },
                        }
                },
        },
        {
                name: "memory_kernel_usage",
                help: "The kernel usage",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetKernel() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.Kernel.Usage),
                                },
                        }
                },
        },
        {
                name: "memory_kerneltcp_failcnt",
                help: "The kerneltcp failcnt",
                unit: metrics.Total,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetKernelTCP() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.KernelTCP.Failcnt),
                                },
                        }
                },
        },
        {
                name: "memory_kerneltcp_limit",
                help: "The kerneltcp limit",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetKernelTCP() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.KernelTCP.Limit),
                                },
                        }
                },
        },
        {
                name: "memory_kerneltcp_max",
                help: "The kerneltcp maximum usage",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetKernelTCP() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.KernelTCP.Max),
                                },
                        }
                },
        },
        {
                name: "memory_kerneltcp_usage",
                help: "The kerneltcp usage",
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if stats.GetMemory().GetKernelTCP() == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Memory.KernelTCP.Usage),
                                },
                        }
                },
        },
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v1

import (
        v1 "github.com/containerd/containerd/v2/core/metrics/types/v1"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// IDName is the name that is used to identify the id being collected in the metric.
// metric.desc uses it as the first label name of every descriptor it builds.
var IDName = "container_id"

// value is a single sample returned by a metric's getValues callback.
type value struct {
        // v is the sample value reported to prometheus.
        v float64
        // l holds extra label values; metric.collect appends them after the
        // id and namespace label values.
        l []string
}

// metric describes one exported stat: the pieces needed to build its
// descriptor and the callback that extracts its samples from a v1.Metrics.
type metric struct {
        // name, help and unit are handed to the namespace when the descriptor is built.
        name   string
        help   string
        unit   metrics.Unit
        // vt is the prometheus value type used for every sample of this metric.
        vt     prometheus.ValueType
        // labels are extra label names placed after IDName and "namespace".
        labels []string
        // getValues returns the value and labels for the data
        getValues func(stats *v1.Metrics) []value
}

// desc builds the prometheus descriptor for m within ns. The label names are
// IDName and "namespace", followed by the metric's own extra labels.
func (m *metric) desc(ns *metrics.Namespace) *prometheus.Desc {
        // the namespace label is for containerd namespaces
        labelNames := append([]string{IDName, "namespace"}, m.labels...)
        return ns.NewDesc(m.name, m.help, m.unit, labelNames...)
}

// collect extracts the samples of m from stats and sends them on ch.
//
// id and namespace become the first two label values of every sample; any
// per-sample label values follow them. ns is the namespace the descriptor is
// built in. When block is true every sample is sent with a blocking send so
// none are missed; otherwise a sample is dropped if ch cannot accept it
// immediately.
func (m *metric) collect(id, namespace string, stats *v1.Metrics, ns *metrics.Namespace, ch chan<- prometheus.Metric, block bool) {
        values := m.getValues(stats)
        if len(values) == 0 {
                return
        }
        // The descriptor depends only on m and ns, so build it once instead of
        // once per sample.
        desc := m.desc(ns)
        for _, v := range values {
                sample := prometheus.MustNewConstMetric(desc, m.vt, v.v, append([]string{id, namespace}, v.l...)...)
                // block signals to block on the sending the metrics so none are missed
                if block {
                        ch <- sample
                        continue
                }
                // non-blocking metrics can be dropped if the chan is full
                select {
                case ch <- sample:
                default:
                }
        }
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v1

import (
        "context"
        "fmt"
        "sync"

        cgroups "github.com/containerd/cgroups/v3/cgroup1"
        cmetrics "github.com/containerd/containerd/v2/core/metrics"
        "github.com/containerd/containerd/v2/core/metrics/cgroups/common"
        v1 "github.com/containerd/containerd/v2/core/metrics/types/v1"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/timeout"
        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
        "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// Trigger will be called when an event happens and provides the cgroup
// where the event originated from. The oomCollector calls it with the id
// and namespace that were registered together with the cgroup, in that order.
type Trigger func(string, string, cgroups.Cgroup)

// NewCollector registers the collector with the provided namespace and returns it so
// that cgroups can be added for collection.
//
// With a nil namespace an empty Collector is returned and nothing is
// registered; Add, Remove and RemoveAll are no-ops on such a collector.
func NewCollector(ns *metrics.Namespace) *Collector {
        if ns == nil {
                return &Collector{}
        }
        // Gather the metric definitions of every controller, in a fixed order.
        groups := [][]*metric{pidMetrics, cpuMetrics, memoryMetrics, hugetlbMetrics, blkioMetrics}
        var all []*metric
        for _, group := range groups {
                all = append(all, group...)
        }
        c := &Collector{
                ns:            ns,
                tasks:         make(map[string]entry),
                metrics:       all,
                storedMetrics: make(chan prometheus.Metric, 100*len(all)),
        }
        ns.Add(c)
        return c
}

// taskID returns the key under which a task is stored in Collector.tasks:
// the id and the namespace joined by a single dash.
func taskID(id, namespace string) string {
        // All operands are strings, so Sprint concatenates them without spaces.
        return fmt.Sprint(id, "-", namespace)
}

// entry is one task tracked by the Collector.
type entry struct {
        // task is queried for its stats, id and namespace on every collection.
        task common.Statable
        // ns is an optional child namespace that carries labels in addition to the parent's.
        // This can be used to append task specific labels to be able to differentiate the different containerd metrics.
        // When nil, the Collector's own namespace is used instead.
        ns *metrics.Namespace
}

// Collector provides the ability to collect container stats and export
// them in the prometheus format
type Collector struct {
        // ns is the namespace the collector is registered with. It is nil for a
        // collector created without a namespace, which turns Add, Remove and
        // RemoveAll into no-ops.
        ns            *metrics.Namespace
        // storedMetrics holds already-built metrics; Collect forwards whatever
        // is queued without blocking on an empty channel.
        storedMetrics chan prometheus.Metric

        // TODO(fuweid):
        //
        // Collector.Collect is the callback of the ns field's Collect, which
        // is invoked periodically while holding an internal lock. Collector.Add
        // might also take the ns lock if the labels are not nil, which can
        // easily cause a dead-lock.
        //
        // Goroutine X:
        //
        //        ns.Collect
        //             ns.Lock
        //          Collector.Collect
        //            Collector.RLock
        //
        //
        // Goroutine Y:
        //
        //        Collector.Add
        //        ...(RLock/Lock)
        //            ns.Lock
        //
        // I think we should seek the way to decouple ns from Collector.
        mu      sync.RWMutex
        // tasks maps taskID(id, namespace) to the tracked task; guarded by mu.
        tasks   map[string]entry
        // metrics lists the metric definitions evaluated for every task.
        metrics []*metric
}

// Describe sends the descriptor of every configured metric, built in the
// collector's own namespace, on ch.
func (c *Collector) Describe(ch chan<- *prometheus.Desc) {
        for i := range c.metrics {
                ch <- c.metrics[i].desc(c.ns)
        }
}

// Collect starts one goroutine per tracked task to gather its stats with
// blocking sends on ch, forwards any metrics queued in storedMetrics, and
// returns once every goroutine has finished.
func (c *Collector) Collect(ch chan<- prometheus.Metric) {
        var pending sync.WaitGroup

        c.mu.RLock()
        for _, task := range c.tasks {
                pending.Add(1)
                go c.collect(task, ch, true, &pending)
        }
        // read stored metrics until the channel is flushed
        for flushed := false; !flushed; {
                select {
                case stored := <-c.storedMetrics:
                        ch <- stored
                default:
                        flushed = true
                }
        }
        c.mu.RUnlock()

        // The per-task goroutines are awaited only after the lock is released.
        pending.Wait()
}

// collect fetches the stats of a single task and emits every configured
// metric for it on ch.
//
// entry is the task to query, ch receives the samples, block selects between
// blocking and droppable sends, and wg, when non-nil, is marked done when the
// function returns. Failures are logged and end the collection for this task;
// nothing is reported back to the caller.
func (c *Collector) collect(entry entry, ch chan<- prometheus.Metric, block bool, wg *sync.WaitGroup) {
        if wg != nil {
                defer wg.Done()
        }

        t := entry.task
        // Bound the stats request, and release the timeout context as soon as
        // the call returns.
        ctx, cancel := timeout.WithContext(context.Background(), cmetrics.ShimStatsRequestTimeout)
        stats, err := t.Stats(namespaces.WithNamespace(ctx, t.Namespace()))
        cancel()

        if err != nil {
                log.L.WithError(err).Errorf("stat task %s", t.ID())
                return
        }

        data, err := typeurl.UnmarshalAny(stats)
        if err != nil {
                log.L.WithError(err).Errorf("unmarshal stats for %s", t.ID())
                return
        }
        s, ok := data.(*v1.Metrics)
        if !ok {
                // err is nil on this path, so attaching it would only add a
                // meaningless field; record the type that was decoded instead.
                log.L.WithField("type", fmt.Sprintf("%T", data)).Errorf("invalid metric type for %s", t.ID())
                return
        }
        // Prefer the task specific namespace (extra labels) when there is one.
        ns := entry.ns
        if ns == nil {
                ns = c.ns
        }
        for _, m := range c.metrics {
                m.collect(t.ID(), t.Namespace(), s, ns, ch, block)
        }
}

// Add starts tracking t so that its metrics are collected and exported.
// Non-nil labels are attached to the task's metrics through a child
// namespace. Adding a task that is already tracked is a no-op, and so is
// calling Add on a collector without a namespace. The returned error is
// always nil.
func (c *Collector) Add(t common.Statable, labels map[string]string) error {
        if c.ns == nil {
                return nil
        }
        key := taskID(t.ID(), t.Namespace())

        c.mu.RLock()
        _, tracked := c.tasks[key]
        c.mu.RUnlock()
        if tracked {
                return nil // requests to collect metrics should be idempotent
        }

        // The child namespace is created while c.mu is not held.
        added := entry{task: t}
        if labels != nil {
                added.ns = c.ns.WithConstLabels(labels)
        }

        c.mu.Lock()
        c.tasks[key] = added
        c.mu.Unlock()
        return nil
}

// Remove stops tracking t. It does nothing on a collector without a
// namespace or when t is not tracked.
func (c *Collector) Remove(t common.Statable) {
        if c.ns == nil {
                return
        }
        key := taskID(t.ID(), t.Namespace())

        c.mu.Lock()
        defer c.mu.Unlock()
        delete(c.tasks, key)
}

// RemoveAll drops every tracked item by replacing the task table with a
// fresh, empty one. It does nothing on a collector without a namespace.
func (c *Collector) RemoveAll() {
        if c.ns == nil {
                return
        }
        empty := map[string]entry{}

        c.mu.Lock()
        defer c.mu.Unlock()
        c.tasks = empty
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v1

import (
        "sync"
        "sync/atomic"

        "golang.org/x/sys/unix"

        cgroups "github.com/containerd/cgroups/v3/cgroup1"
        "github.com/containerd/containerd/v2/pkg/sys"
        "github.com/containerd/log"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// newOOMCollector creates an epoll instance, wraps it in an oomCollector and
// starts the goroutine that waits for events on it. When ns is non-nil the
// "memory_oom" descriptor is created in it and the collector is registered
// with it. An error is returned only if the epoll instance cannot be created.
func newOOMCollector(ns *metrics.Namespace) (*oomCollector, error) {
        epfd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC)
        if err != nil {
                return nil, err
        }
        collector := &oomCollector{
                fd:  epfd,
                set: make(map[uintptr]*oom),
        }
        if ns != nil {
                collector.desc = ns.NewDesc("memory_oom", "The number of times a container has received an oom event", metrics.Total, "container_id", "namespace")
                ns.Add(collector)
        }
        go collector.start()
        return collector, nil
}

// oomCollector waits for OOM notifications on an epoll instance and exports
// a per-container counter of the events it has seen.
type oomCollector struct {
        // mu guards set.
        mu sync.Mutex

        // desc is the descriptor of the exported counter; it stays nil when the
        // collector was created without a namespace.
        desc *prometheus.Desc
        // fd is the epoll file descriptor created by newOOMCollector.
        fd   int
        // set maps the event fd obtained from a cgroup to its bookkeeping entry.
        set  map[uintptr]*oom
}

// oom is the bookkeeping kept for one registered cgroup.
type oom struct {
        // count needs to stay the first member of this struct to ensure 64bits
        // alignment on a 32bits machine (e.g. arm32). This is necessary as we use
        // the sync/atomic operations on this field.
        count     int64
        // id and namespace are reported as the label values of the counter and
        // passed to every trigger.
        id        string
        namespace string
        // c is the cgroup the event fd belongs to; its state is checked on each event.
        c         cgroups.Cgroup
        // triggers are called, in order, for every event that is counted.
        triggers  []Trigger
}

// Add registers cg for OOM notifications.
//
// id and namespace label the exported counter and are passed to triggers,
// which are called for every counted event. The event fd obtained from cg is
// added to the epoll instance and tracked in the set. An error is returned
// when the event fd cannot be obtained or cannot be added to epoll; in the
// latter case the fd is closed and nothing is tracked.
func (o *oomCollector) Add(id, namespace string, cg cgroups.Cgroup, triggers ...Trigger) error {
        o.mu.Lock()
        defer o.mu.Unlock()
        fd, err := cg.OOMEventFD()
        if err != nil {
                return err
        }
        event := unix.EpollEvent{
                Fd:     int32(fd),
                Events: unix.EPOLLHUP | unix.EPOLLIN | unix.EPOLLERR,
        }
        if err := unix.EpollCtl(o.fd, unix.EPOLL_CTL_ADD, int(fd), &event); err != nil {
                // No event will ever be delivered for this fd, so it would never
                // be closed by process; release it here instead of leaking it.
                // The close error is ignored: the epoll failure is what matters
                // to the caller.
                _ = unix.Close(int(fd))
                return err
        }
        // Track the entry only once epoll accepted the fd, so that the set never
        // holds an entry that cannot receive events. process takes o.mu before
        // looking the fd up, so it cannot observe the fd before this assignment.
        o.set[fd] = &oom{
                id:        id,
                c:         cg,
                triggers:  triggers,
                namespace: namespace,
        }
        return nil
}

// Describe sends the descriptor of the OOM counter on ch.
func (o *oomCollector) Describe(ch chan<- *prometheus.Desc) {
        ch <- o.desc
}

// Collect emits, for every tracked cgroup, a counter sample holding the
// number of events counted so far, labelled with its id and namespace. The
// set is locked for the whole iteration.
func (o *oomCollector) Collect(ch chan<- prometheus.Metric) {
        o.mu.Lock()
        defer o.mu.Unlock()
        for _, tracked := range o.set {
                // count is updated atomically by process, so read it the same way.
                total := float64(atomic.LoadInt64(&tracked.count))
                ch <- prometheus.MustNewConstMetric(o.desc, prometheus.CounterValue, total, tracked.id, tracked.namespace)
        }
}

// Close closes the epoll fd and returns the error of the close call, if any.
func (o *oomCollector) Close() error {
        return unix.Close(o.fd)
}

// start waits on the epoll instance forever and hands the fd of every ready
// event to process. Waits interrupted by EINTR are retried; any other wait
// error is logged and ends the loop.
func (o *oomCollector) start() {
        var events [128]unix.EpollEvent
        for {
                ready := 0
                waitErr := sys.IgnoringEINTR(func() error {
                        var err error
                        ready, err = unix.EpollWait(o.fd, events[:], -1)
                        return err
                })
                if waitErr != nil {
                        log.L.WithError(waitErr).Error("cgroups: epoll wait failed, OOM notifications disabled")
                        return
                }

                for idx := 0; idx < ready; idx++ {
                        o.process(uintptr(events[idx].Fd))
                }
        }
}

// process handles one event on fd. The eventfd is drained first; then, if fd
// is tracked, either the entry is dropped and the fd closed (cgroup deleted)
// or the counter is incremented and the triggers are called.
func (o *oomCollector) process(fd uintptr) {
        // make sure to always flush the eventfd
        flushEventfd(fd)

        o.mu.Lock()
        info, tracked := o.set[fd]
        o.mu.Unlock()
        if !tracked {
                return
        }

        if info.c.State() != cgroups.Deleted {
                atomic.AddInt64(&info.count, 1)
                for _, fire := range info.triggers {
                        fire(info.id, info.namespace, info.c)
                }
                return
        }

        // if we received an event but it was caused by the cgroup being deleted and the fd
        // being closed make sure we close our copy and remove the container from the set
        o.mu.Lock()
        delete(o.set, fd)
        o.mu.Unlock()
        unix.Close(int(fd))
}

// flushEventfd performs a single read on efd and returns the read error, if
// any. The data that was read is discarded.
func flushEventfd(efd uintptr) error {
        // Buffer must be >= 8 bytes for eventfd reads
        // https://man7.org/linux/man-pages/man2/eventfd.2.html
        scratch := make([]byte, 8)
        if _, err := unix.Read(int(efd), scratch); err != nil {
                return err
        }
        return nil
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v1

import (
        v1 "github.com/containerd/containerd/v2/core/metrics/types/v1"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// pidMetrics are the metrics derived from the Pids section of the stats.
// Both share the name "pids" and differ only by unit; neither yields a
// sample when the section is absent.
var pidMetrics = []*metric{
        // Configured pid limit.
        {
                name: "pids",
                help: "The limit to the number of pids allowed",
                unit: metrics.Unit("limit"),
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if pids := stats.Pids; pids != nil {
                                return []value{{v: float64(pids.Limit)}}
                        }
                        return nil
                },
        },
        // Current pid count.
        {
                name: "pids",
                help: "The current number of pids",
                unit: metrics.Unit("current"),
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v1.Metrics) []value {
                        if pids := stats.Pids; pids != nil {
                                return []value{{v: float64(pids.Current)}}
                        }
                        return nil
                },
        },
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "context"

        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/runtime"
        "github.com/docker/go-metrics"
)

// NewTaskMonitor returns a cgroups monitor whose collector is created for
// ns. The context and the publisher are stored on the monitor. The returned
// error is always nil.
func NewTaskMonitor(ctx context.Context, publisher events.Publisher, ns *metrics.Namespace) (runtime.TaskMonitor, error) {
        monitor := &cgroupsMonitor{
                collector: NewCollector(ns),
                context:   ctx,
                publisher: publisher,
        }
        return monitor, nil
}

// cgroupsMonitor adds tasks to and removes them from a metrics Collector.
type cgroupsMonitor struct {
        // collector receives the tasks passed to Monitor and Stop.
        collector *Collector
        // context and publisher are stored by NewTaskMonitor.
        // NOTE(review): neither is read by Monitor or Stop; confirm whether they are used elsewhere.
        context   context.Context
        publisher events.Publisher
}

// Monitor adds the task to the collector with the given labels and returns
// whatever error the collector reports.
func (m *cgroupsMonitor) Monitor(c runtime.Task, labels map[string]string) error {
        return m.collector.Add(c, labels)
}

// Stop removes the task from the collector. It always returns nil.
func (m *cgroupsMonitor) Stop(c runtime.Task) error {
        m.collector.Remove(c)
        return nil
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        v2 "github.com/containerd/containerd/v2/core/metrics/types/v2"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// cpuMetrics are the metrics derived from the CPU section of the cgroup v2
// stats. Each one yields a single unlabelled sample, or none when the CPU
// section is absent.
var cpuMetrics = []*metric{
        // Total usage.
        {
                name: "cpu_usage_usec",
                help: "Total cpu usage (cgroup v2)",
                unit: metrics.Unit("microseconds"),
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v2.Metrics) []value {
                        if cpu := stats.CPU; cpu != nil {
                                return []value{{v: float64(cpu.UsageUsec)}}
                        }
                        return nil
                },
        },
        // User space usage.
        {
                name: "cpu_user_usec",
                help: "Current cpu usage in user space (cgroup v2)",
                unit: metrics.Unit("microseconds"),
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v2.Metrics) []value {
                        if cpu := stats.CPU; cpu != nil {
                                return []value{{v: float64(cpu.UserUsec)}}
                        }
                        return nil
                },
        },
        // Kernel space usage.
        {
                name: "cpu_system_usec",
                help: "Current cpu usage in kernel space (cgroup v2)",
                unit: metrics.Unit("microseconds"),
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v2.Metrics) []value {
                        if cpu := stats.CPU; cpu != nil {
                                return []value{{v: float64(cpu.SystemUsec)}}
                        }
                        return nil
                },
        },
        // Number of periods.
        {
                name: "cpu_nr_periods",
                help: "Current cpu number of periods (only if controller is enabled)",
                unit: metrics.Total,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v2.Metrics) []value {
                        if cpu := stats.CPU; cpu != nil {
                                return []value{{v: float64(cpu.NrPeriods)}}
                        }
                        return nil
                },
        },
        // Number of throttle events.
        {
                name: "cpu_nr_throttled",
                help: "Total number of times tasks have been throttled (only if controller is enabled)",
                unit: metrics.Total,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v2.Metrics) []value {
                        if cpu := stats.CPU; cpu != nil {
                                return []value{{v: float64(cpu.NrThrottled)}}
                        }
                        return nil
                },
        },
        // Time spent throttled.
        {
                name: "cpu_throttled_usec",
                help: "Total time duration for which tasks have been throttled. (only if controller is enabled)",
                unit: metrics.Unit("microseconds"),
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v2.Metrics) []value {
                        if cpu := stats.CPU; cpu != nil {
                                return []value{{v: float64(cpu.ThrottledUsec)}}
                        }
                        return nil
                },
        },
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "strconv"

        v2 "github.com/containerd/containerd/v2/core/metrics/types/v2"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// ioMetrics are the metrics derived from the Io section of the cgroup v2
// stats. Each one yields one sample per usage entry, labelled with the
// entry's major and minor numbers in decimal, and nothing when the Io
// section is absent.
var ioMetrics = []*metric{
        // Bytes read.
        {
                name:   "io_rbytes",
                help:   "IO bytes read",
                unit:   metrics.Bytes,
                vt:     prometheus.GaugeValue,
                labels: []string{"major", "minor"},
                getValues: func(stats *v2.Metrics) []value {
                        ioStat := stats.Io
                        if ioStat == nil {
                                return nil
                        }
                        var samples []value
                        for _, dev := range ioStat.Usage {
                                major := strconv.FormatUint(dev.Major, 10)
                                minor := strconv.FormatUint(dev.Minor, 10)
                                samples = append(samples, value{v: float64(dev.Rbytes), l: []string{major, minor}})
                        }
                        return samples
                },
        },
        // Bytes written.
        {
                name:   "io_wbytes",
                help:   "IO bytes written",
                unit:   metrics.Bytes,
                vt:     prometheus.GaugeValue,
                labels: []string{"major", "minor"},
                getValues: func(stats *v2.Metrics) []value {
                        ioStat := stats.Io
                        if ioStat == nil {
                                return nil
                        }
                        var samples []value
                        for _, dev := range ioStat.Usage {
                                major := strconv.FormatUint(dev.Major, 10)
                                minor := strconv.FormatUint(dev.Minor, 10)
                                samples = append(samples, value{v: float64(dev.Wbytes), l: []string{major, minor}})
                        }
                        return samples
                },
        },
        // Read operations.
        {
                name:   "io_rios",
                help:   "Number of read IOs",
                unit:   metrics.Total,
                vt:     prometheus.GaugeValue,
                labels: []string{"major", "minor"},
                getValues: func(stats *v2.Metrics) []value {
                        ioStat := stats.Io
                        if ioStat == nil {
                                return nil
                        }
                        var samples []value
                        for _, dev := range ioStat.Usage {
                                major := strconv.FormatUint(dev.Major, 10)
                                minor := strconv.FormatUint(dev.Minor, 10)
                                samples = append(samples, value{v: float64(dev.Rios), l: []string{major, minor}})
                        }
                        return samples
                },
        },
        // Write operations.
        {
                name:   "io_wios",
                help:   "Number of write IOs",
                unit:   metrics.Total,
                vt:     prometheus.GaugeValue,
                labels: []string{"major", "minor"},
                getValues: func(stats *v2.Metrics) []value {
                        ioStat := stats.Io
                        if ioStat == nil {
                                return nil
                        }
                        var samples []value
                        for _, dev := range ioStat.Usage {
                                major := strconv.FormatUint(dev.Major, 10)
                                minor := strconv.FormatUint(dev.Minor, 10)
                                samples = append(samples, value{v: float64(dev.Wios), l: []string{major, minor}})
                        }
                        return samples
                },
        },
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        v2 "github.com/containerd/containerd/v2/core/metrics/types/v2"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// memoryMetrics lists the cgroup v2 memory gauges, in the order they are
// described and collected. Every entry except memory_oom reads a field of
// stats.Memory and yields no value when that section is nil.
//
// NOTE(review): several entries (pgfault, pgscan, thp_fault_alloc, ...) look
// like event counters yet carry the Bytes unit; the unit is part of the
// exported series name, so it is kept as is — confirm before changing.
var memoryMetrics = []*metric{
        memoryGauge("memory_usage", "Current memory usage (cgroup v2)", func(s *v2.Metrics) float64 { return float64(s.Memory.Usage) }),
        memoryGauge("memory_usage_limit", "Current memory usage limit (cgroup v2)", func(s *v2.Metrics) float64 { return float64(s.Memory.UsageLimit) }),
        memoryGauge("memory_swap_usage", "Current swap usage (cgroup v2)", func(s *v2.Metrics) float64 { return float64(s.Memory.SwapUsage) }),
        memoryGauge("memory_swap_limit", "Current swap usage limit (cgroup v2)", func(s *v2.Metrics) float64 { return float64(s.Memory.SwapLimit) }),

        memoryGauge("memory_file_mapped", "The file_mapped amount", func(s *v2.Metrics) float64 { return float64(s.Memory.FileMapped) }),
        memoryGauge("memory_file_dirty", "The file_dirty amount", func(s *v2.Metrics) float64 { return float64(s.Memory.FileDirty) }),
        memoryGauge("memory_file_writeback", "The file_writeback amount", func(s *v2.Metrics) float64 { return float64(s.Memory.FileWriteback) }),
        memoryGauge("memory_pgactivate", "The pgactivate amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Pgactivate) }),
        memoryGauge("memory_pgdeactivate", "The pgdeactivate amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Pgdeactivate) }),
        memoryGauge("memory_pgfault", "The pgfault amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Pgfault) }),
        memoryGauge("memory_pgmajfault", "The pgmajfault amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Pgmajfault) }),
        memoryGauge("memory_pglazyfree", "The pglazyfree amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Pglazyfree) }),
        memoryGauge("memory_pgrefill", "The pgrefill amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Pgrefill) }),
        memoryGauge("memory_pglazyfreed", "The pglazyfreed amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Pglazyfreed) }),
        memoryGauge("memory_pgscan", "The pgscan amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Pgscan) }),
        memoryGauge("memory_pgsteal", "The pgsteal amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Pgsteal) }),
        memoryGauge("memory_inactive_anon", "The inactive_anon amount", func(s *v2.Metrics) float64 { return float64(s.Memory.InactiveAnon) }),
        memoryGauge("memory_active_anon", "The active_anon amount", func(s *v2.Metrics) float64 { return float64(s.Memory.ActiveAnon) }),
        memoryGauge("memory_inactive_file", "The inactive_file amount", func(s *v2.Metrics) float64 { return float64(s.Memory.InactiveFile) }),
        memoryGauge("memory_active_file", "The active_file amount", func(s *v2.Metrics) float64 { return float64(s.Memory.ActiveFile) }),
        memoryGauge("memory_unevictable", "The unevictable amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Unevictable) }),
        memoryGauge("memory_anon", "The anon amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Anon) }),
        memoryGauge("memory_file", "The file amount", func(s *v2.Metrics) float64 { return float64(s.Memory.File) }),
        memoryGauge("memory_kernel_stack", "The kernel_stack amount", func(s *v2.Metrics) float64 { return float64(s.Memory.KernelStack) }),
        memoryGauge("memory_slab", "The slab amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Slab) }),
        memoryGauge("memory_sock", "The sock amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Sock) }),
        memoryGauge("memory_shmem", "The shmem amount", func(s *v2.Metrics) float64 { return float64(s.Memory.Shmem) }),
        memoryGauge("memory_anon_thp", "The anon_thp amount", func(s *v2.Metrics) float64 { return float64(s.Memory.AnonThp) }),
        memoryGauge("memory_slab_reclaimable", "The slab_reclaimable amount", func(s *v2.Metrics) float64 { return float64(s.Memory.SlabReclaimable) }),
        memoryGauge("memory_slab_unreclaimable", "The slab_unreclaimable amount", func(s *v2.Metrics) float64 { return float64(s.Memory.SlabUnreclaimable) }),
        memoryGauge("memory_workingset_refault", "The workingset_refault amount", func(s *v2.Metrics) float64 { return float64(s.Memory.WorkingsetRefault) }),
        memoryGauge("memory_workingset_activate", "The workingset_activate amount", func(s *v2.Metrics) float64 { return float64(s.Memory.WorkingsetActivate) }),
        memoryGauge("memory_workingset_nodereclaim", "The workingset_nodereclaim amount", func(s *v2.Metrics) float64 { return float64(s.Memory.WorkingsetNodereclaim) }),
        memoryGauge("memory_thp_fault_alloc", "The thp_fault_alloc amount", func(s *v2.Metrics) float64 { return float64(s.Memory.ThpFaultAlloc) }),
        memoryGauge("memory_thp_collapse_alloc", "The thp_collapse_alloc amount", func(s *v2.Metrics) float64 { return float64(s.Memory.ThpCollapseAlloc) }),

        // memory_oom is the only entry sourced from MemoryEvents rather than
        // Memory, and the only one using the Total unit.
        {
                name: "memory_oom",
                help: "The number of times a container has received an oom event",
                unit: metrics.Total,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v2.Metrics) []value {
                        if events := stats.MemoryEvents; events != nil {
                                return []value{{v: float64(events.Oom)}}
                        }
                        return nil
                },
        },
}

// memoryGauge builds an unlabelled gauge with the Bytes unit whose single
// sample is produced by read. read is only invoked when stats.Memory is
// non-nil, so it may dereference stats.Memory directly.
func memoryGauge(name, help string, read func(stats *v2.Metrics) float64) *metric {
        return &metric{
                name: name,
                help: help,
                unit: metrics.Bytes,
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v2.Metrics) []value {
                        if stats.Memory == nil {
                                return nil
                        }
                        return []value{{v: read(stats)}}
                },
        }
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        v2 "github.com/containerd/containerd/v2/core/metrics/types/v2"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// IDName is the name of the label that identifies the id being collected.
// It is the first label of every metric descriptor built by this package,
// followed by "namespace".
var IDName = "container_id"

// value is a single sample produced by a metric's getValues callback.
type value struct {
        // v is the sample value reported to prometheus.
        v float64
        // l holds the label values that are appended after the id and
        // namespace label values when the sample is emitted.
        l []string
}

// metric describes one exported series: how its descriptor is built and how
// its samples are extracted from a task's cgroup v2 stats.
type metric struct {
        // name, help and unit are passed to the namespace when the descriptor
        // is created.
        name   string
        help   string
        unit   metrics.Unit
        // vt is the prometheus value type used for every emitted sample.
        vt     prometheus.ValueType
        // labels are extra label names appended after IDName and "namespace".
        labels []string
        // getValues returns the value and labels for the data
        getValues func(stats *v2.Metrics) []value
}

// desc builds the prometheus descriptor for m inside ns. The label names are
// IDName, then "namespace" (the containerd namespace), then m.labels.
func (m *metric) desc(ns *metrics.Namespace) *prometheus.Desc {
        names := make([]string, 0, 2+len(m.labels))
        names = append(names, IDName, "namespace")
        names = append(names, m.labels...)
        return ns.NewDesc(m.name, m.help, m.unit, names...)
}

// collect extracts the samples for m from stats and sends them on ch.
//
// id and namespace become the first two label values of every sample,
// followed by the sample's own label values. The descriptor is created from
// ns once per call rather than once per sample, since it does not depend on
// the sample.
//
// When block is true every sample is sent with a blocking send so none are
// missed; otherwise a sample is dropped if ch cannot accept it immediately.
func (m *metric) collect(id, namespace string, stats *v2.Metrics, ns *metrics.Namespace, ch chan<- prometheus.Metric, block bool) {
        values := m.getValues(stats)
        if len(values) == 0 {
                return
        }
        desc := m.desc(ns)
        for _, v := range values {
                labelValues := make([]string, 0, 2+len(v.l))
                labelValues = append(labelValues, id, namespace)
                labelValues = append(labelValues, v.l...)
                sample := prometheus.MustNewConstMetric(desc, m.vt, v.v, labelValues...)

                // block signals to block on sending the metrics so none are missed
                if block {
                        ch <- sample
                        continue
                }
                // non-blocking metrics can be dropped if the chan is full
                select {
                case ch <- sample:
                default:
                }
        }
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "context"
        "fmt"
        "sync"

        cmetrics "github.com/containerd/containerd/v2/core/metrics"
        "github.com/containerd/containerd/v2/core/metrics/cgroups/common"
        v2 "github.com/containerd/containerd/v2/core/metrics/types/v2"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/timeout"
        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
        "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// NewCollector creates a Collector for the given namespace, registers it with
// that namespace and returns it so that cgroups can be added for collection.
// A nil namespace yields an empty Collector that is not registered anywhere.
func NewCollector(ns *metrics.Namespace) *Collector {
        collector := &Collector{}
        if ns == nil {
                return collector
        }
        collector.ns = ns
        collector.tasks = map[string]entry{}
        for _, group := range [][]*metric{pidMetrics, cpuMetrics, memoryMetrics, ioMetrics} {
                collector.metrics = append(collector.metrics, group...)
        }
        // room for 100 buffered samples per registered metric
        collector.storedMetrics = make(chan prometheus.Metric, 100*len(collector.metrics))
        ns.Add(collector)
        return collector
}

// taskID returns the key under which a task is tracked by the Collector.
//
// The id is prefixed with its length so that the key is unambiguous even when
// id or namespace contain '-': a plain "id-namespace" join would map
// ("foo-bar", "baz") and ("foo", "bar-baz") to the same key.
func taskID(id, namespace string) string {
        return fmt.Sprintf("%d:%s-%s", len(id), id, namespace)
}

// entry is a task tracked by the Collector together with the namespace its
// metrics are described in.
type entry struct {
        // task is the source of the stats.
        task common.Statable
        // ns is an optional child namespace that carries labels in addition to
        // those of the parent namespace. It can be used to attach task-specific
        // labels so that the metrics of different tasks can be told apart.
        // When nil, the Collector's own namespace is used.
        ns *metrics.Namespace
}

// Collector provides the ability to collect container stats and export
// them in the prometheus format
type Collector struct {
        // ns is the namespace the collector is registered with; Add, Remove and
        // RemoveAll do nothing when it is nil.
        ns            *metrics.Namespace
        // storedMetrics holds buffered metrics that Collect forwards to its
        // output channel before waiting for the per-task collections.
        storedMetrics chan prometheus.Metric

        // TODO(fuweid):
        //
        // Collector.Collect is the callback of the ns field's Collect, which
        // is invoked periodically while ns holds its internal lock. And
        // Collector.Add might also take ns.Lock if the labels are not nil,
        // which can easily cause a deadlock.
        //
        // Goroutine X:
        //
        //        ns.Collect
        //             ns.Lock
        //          Collector.Collect
        //            Collector.RLock
        //
        //
        // Goroutine Y:
        //
        //        Collector.Add
        //        ...(RLock/Lock)
        //            ns.Lock
        //
        // I think we should look for a way to decouple ns from Collector.
        mu      sync.RWMutex
        // tasks maps taskID keys to the tracked entries; guarded by mu.
        tasks   map[string]entry
        // metrics is the list of metrics described and collected for each task.
        metrics []*metric
}

// Describe sends the descriptor of every registered metric, built against the
// collector's namespace, on ch.
func (c *Collector) Describe(ch chan<- *prometheus.Desc) {
        for i := range c.metrics {
                ch <- c.metrics[i].desc(c.ns)
        }
}

// Collect gathers the metrics of every tracked task and sends them on ch.
//
// While holding the read lock it starts one blocking collection goroutine per
// task and forwards whatever is currently buffered in storedMetrics. The lock
// is released before waiting for the goroutines to finish.
func (c *Collector) Collect(ch chan<- prometheus.Metric) {
        var pending sync.WaitGroup

        c.mu.RLock()
        for _, tracked := range c.tasks {
                pending.Add(1)
                go c.collect(tracked, ch, true, &pending)
        }
        // read stored metrics until the channel is flushed
        for flushed := false; !flushed; {
                select {
                case stored := <-c.storedMetrics:
                        ch <- stored
                default:
                        flushed = true
                }
        }
        c.mu.RUnlock()

        pending.Wait()
}

// collect fetches the stats of a single entry and emits all registered
// metrics for it on ch.
//
// The stats request uses a context obtained from timeout.WithContext with
// cmetrics.ShimStatsRequestTimeout and carries the task's namespace. Any
// failure (stats request, unmarshalling, unexpected payload type) is logged
// and ends the collection for this entry without emitting anything.
//
// block is forwarded to each metric and selects blocking versus droppable
// sends. If wg is non-nil, wg.Done is called when the function returns.
func (c *Collector) collect(entry entry, ch chan<- prometheus.Metric, block bool, wg *sync.WaitGroup) {
        if wg != nil {
                defer wg.Done()
        }

        t := entry.task
        ctx, cancel := timeout.WithContext(context.Background(), cmetrics.ShimStatsRequestTimeout)
        stats, err := t.Stats(namespaces.WithNamespace(ctx, t.Namespace()))
        // the context is only needed for the request itself, release it right away
        cancel()

        if err != nil {
                log.L.WithError(err).Errorf("stat task %s", t.ID())
                return
        }

        data, err := typeurl.UnmarshalAny(stats)
        if err != nil {
                log.L.WithError(err).Errorf("unmarshal stats for %s", t.ID())
                return
        }
        s, ok := data.(*v2.Metrics)
        if !ok {
                // err is nil on this path, so report the unexpected type
                // instead of attaching an empty error to the log entry
                log.L.Errorf("invalid metric type %T for %s", data, t.ID())
                return
        }
        // prefer the entry's own namespace (with its extra labels) when present
        ns := entry.ns
        if ns == nil {
                ns = c.ns
        }
        for _, m := range c.metrics {
                m.collect(t.ID(), t.Namespace(), s, ns, ch, block)
        }
}

// Add starts tracking t so that its metrics are collected and exported.
//
// It is a no-op on a collector without a namespace and when t is already
// tracked. Non-nil labels are attached through a child namespace created
// with WithConstLabels; that call is made while the collector's lock is not
// held. Add always returns nil.
func (c *Collector) Add(t common.Statable, labels map[string]string) error {
        if c.ns == nil {
                return nil
        }

        c.mu.RLock()
        key := taskID(t.ID(), t.Namespace())
        _, tracked := c.tasks[key]
        c.mu.RUnlock()
        if tracked {
                // requests to collect metrics should be idempotent
                return nil
        }

        added := entry{task: t}
        if labels != nil {
                added.ns = c.ns.WithConstLabels(labels)
        }

        c.mu.Lock()
        c.tasks[key] = added
        c.mu.Unlock()
        return nil
}

// Remove stops tracking t. It is a no-op on a collector without a namespace
// and when t is not tracked.
func (c *Collector) Remove(t common.Statable) {
        if c.ns == nil {
                return
        }
        c.mu.Lock()
        defer c.mu.Unlock()
        key := taskID(t.ID(), t.Namespace())
        delete(c.tasks, key)
}

// RemoveAll drops every tracked item by replacing the task map with an empty
// one. It is a no-op on a collector without a namespace.
func (c *Collector) RemoveAll() {
        if c.ns == nil {
                return
        }
        c.mu.Lock()
        defer c.mu.Unlock()
        c.tasks = map[string]entry{}
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        v2 "github.com/containerd/containerd/v2/core/metrics/types/v2"
        metrics "github.com/docker/go-metrics"
        "github.com/prometheus/client_golang/prometheus"
)

// pidMetrics describes the gauges derived from the Pids section of the
// cgroup v2 stats. Both entries share the name "pids" and differ by unit
// ("limit" and "current"). Each getValues returns nil when stats.Pids is
// nil, so no sample is produced for that metric.
var pidMetrics = []*metric{
        {
                // Configured pid limit (stats.Pids.Limit).
                name: "pids",
                help: "The limit to the number of pids allowed (cgroup v2)",
                unit: metrics.Unit("limit"),
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v2.Metrics) []value {
                        if stats.Pids == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Pids.Limit),
                                },
                        }
                },
        },
        {
                // Current pid count (stats.Pids.Current).
                name: "pids",
                help: "The current number of pids (cgroup v2)",
                unit: metrics.Unit("current"),
                vt:   prometheus.GaugeValue,
                getValues: func(stats *v2.Metrics) []value {
                        if stats.Pids == nil {
                                return nil
                        }
                        return []value{
                                {
                                        v: float64(stats.Pids.Current),
                                },
                        }
                },
        },
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metrics

import (
        "time"

        "github.com/containerd/containerd/v2/pkg/timeout"
        "github.com/containerd/containerd/v2/version"
        goMetrics "github.com/docker/go-metrics"
)

const (
        // ShimStatsRequestTimeout is the key under which the shim stats
        // request timeout is registered with the timeout package; init sets
        // its default to two seconds.
        ShimStatsRequestTimeout = "io.containerd.timeout.metrics.shimstats"
)

// init registers the "containerd" metrics namespace, publishes a build_info
// counter labeled with the version and revision, and sets the default shim
// stats request timeout.
func init() {
        namespace := goMetrics.NewNamespace("containerd", "", nil)
        buildInfo := namespace.NewLabeledCounter("build_info", "containerd build information", "version", "revision")
        buildInfo.WithValues(version.Version, version.Revision).Inc()
        goMetrics.Register(namespace)

        const defaultShimStatsTimeout = 2 * time.Second
        timeout.Set(ShimStatsRequestTimeout, defaultShimStatsTimeout)
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package mount

import (
        "fmt"

        "github.com/moby/sys/mountinfo"
)

// Lookup returns the mount info that corresponds to the path.
//
// dir is canonicalized first. Among the mounts selected by the parents
// filter for the resolved path, the first one with the longest mount point
// is returned. An error is returned when the path cannot be canonicalized,
// the mounts cannot be read, or no mount matches.
func Lookup(dir string) (Info, error) {
        resolvedDir, err := CanonicalizePath(dir)
        if err != nil {
                return Info{}, err
        }

        candidates, err := mountinfo.GetMounts(mountinfo.ParentsFilter(resolvedDir))
        switch {
        case err != nil:
                return Info{}, fmt.Errorf("failed to find the mount info for %q: %w", resolvedDir, err)
        case len(candidates) == 0:
                return Info{}, fmt.Errorf("failed to find the mount info for %q", resolvedDir)
        }

        // Keep the earliest entry whose mount point is strictly the longest.
        best := candidates[0]
        for _, c := range candidates[1:] {
                if len(c.Mountpoint) > len(best.Mountpoint) {
                        best = c
                }
        }
        return *best, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package mount

import (
        "errors"
        "fmt"
        "os"
        "strings"
        "time"

        "github.com/containerd/containerd/v2/internal/randutil"
        kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
        "golang.org/x/sys/unix"
)

const (
        // loopControlPath is the control device opened to ask for a free loop
        // device number.
        loopControlPath = "/dev/loop-control"
        // loopDevFormat builds a loop device path from its number.
        loopDevFormat   = "/dev/loop%d"

        // ebusyString is the error text searched for to detect that a loop
        // device was busy and setup should be retried.
        ebusyString = "device or resource busy"
)

// LoopParams holds the parameters that control loop device setup.
type LoopParams struct {
        // Readonly makes the loop device forbid writes; the backing file and
        // the loop device are then opened read-only.
        Readonly bool
        // Autoclear makes the kernel clear the loop device automatically when
        // the last opener closes it.
        Autoclear bool
        // Direct uses direct IO to access the loop backing file.
        Direct bool
}

// getFreeLoopDev opens the loop control device and asks the kernel, via the
// LOOP_CTL_GET_FREE ioctl, for the number of a free loop device.
//
// Errors are wrapped with %w so callers can inspect the underlying cause
// with errors.Is / errors.As.
func getFreeLoopDev() (uint32, error) {
        ctrl, err := os.OpenFile(loopControlPath, os.O_RDWR, 0)
        if err != nil {
                return 0, fmt.Errorf("could not open %v: %w", loopControlPath, err)
        }
        defer ctrl.Close()
        num, err := unix.IoctlRetInt(int(ctrl.Fd()), unix.LOOP_CTL_GET_FREE)
        if err != nil {
                return 0, fmt.Errorf("could not get free loop device: %w", err)
        }
        return uint32(num), nil
}

// setupLoopDev attaches the backing file to the loop device and returns
// the file handle for the loop device. The caller is responsible for
// closing the file handle.
//
// On kernels >= 5.8 the device is configured in a single LOOP_CONFIGURE
// ioctl. Otherwise (or when the kernel version cannot be determined) it
// falls back to LOOP_SET_FD followed by LOOP_SET_STATUS64 and, if requested,
// LOOP_SET_DIRECT_IO. On any error the loop device handle is closed and nil
// is returned.
func setupLoopDev(backingFile, loopDev string, param LoopParams) (_ *os.File, retErr error) {
        // 1. Open backing file and loop device
        flags := os.O_RDWR
        if param.Readonly {
                flags = os.O_RDONLY
        }

        back, err := os.OpenFile(backingFile, flags, 0)
        if err != nil {
                return nil, fmt.Errorf("could not open backing file: %s: %w", backingFile, err)
        }
        // The backing file handle is only needed while attaching.
        defer back.Close()

        loop, err := os.OpenFile(loopDev, flags, 0)
        if err != nil {
                return nil, fmt.Errorf("could not open loop device: %s: %w", loopDev, err)
        }
        // Only close the loop handle on failure; on success it is handed to
        // the caller.
        defer func() {
                if retErr != nil {
                        loop.Close()
                }
        }()

        // Single-ioctl path, taken only when the version check succeeds and
        // reports a kernel of at least 5.8.
        fiveDotEight := kernel.KernelVersion{Kernel: 5, Major: 8}
        if ok, err := kernel.GreaterEqualThan(fiveDotEight); err == nil && ok {
                config := unix.LoopConfig{
                        Fd: uint32(back.Fd()),
                }

                copy(config.Info.File_name[:], backingFile)
                if param.Readonly {
                        config.Info.Flags |= unix.LO_FLAGS_READ_ONLY
                }

                if param.Autoclear {
                        config.Info.Flags |= unix.LO_FLAGS_AUTOCLEAR
                }

                if param.Direct {
                        config.Info.Flags |= unix.LO_FLAGS_DIRECT_IO
                }

                if err := unix.IoctlLoopConfigure(int(loop.Fd()), &config); err != nil {
                        return nil, fmt.Errorf("failed to configure loop device: %s: %w", loopDev, err)
                }

                return loop, nil
        }

        // 2. Set FD
        if err := unix.IoctlSetInt(int(loop.Fd()), unix.LOOP_SET_FD, int(back.Fd())); err != nil {
                return nil, fmt.Errorf("could not set loop fd for device: %s: %w", loopDev, err)
        }

        // From here on the backing file is attached, so a later failure must
        // detach it again. Deferred calls run last-in-first-out, so this runs
        // before the deferred loop.Close above.
        defer func() {
                if retErr != nil {
                        _ = unix.IoctlSetInt(int(loop.Fd()), unix.LOOP_CLR_FD, 0)
                }
        }()

        // 3. Set Info
        info := unix.LoopInfo64{}
        copy(info.File_name[:], backingFile)
        if param.Readonly {
                info.Flags |= unix.LO_FLAGS_READ_ONLY
        }

        if param.Autoclear {
                info.Flags |= unix.LO_FLAGS_AUTOCLEAR
        }

        err = unix.IoctlLoopSetStatus64(int(loop.Fd()), &info)
        if err != nil {
                return nil, fmt.Errorf("failed to set loop device info: %w", err)
        }

        // 4. Set Direct IO
        if param.Direct {
                err = unix.IoctlSetInt(int(loop.Fd()), unix.LOOP_SET_DIRECT_IO, 1)
                if err != nil {
                        return nil, fmt.Errorf("failed to setup loop with direct: %w", err)
                }
        }

        return loop, nil
}

// setupLoop finds a free loop device and attaches backingFile to it,
// retrying when the chosen device turns out to be busy.
//
// With param.Autoclear set, the returned file handle keeps the autoclear
// flag in effect and the kernel is relied upon for cleanup: if the caller
// closes the handle after mounting the device, the loop device is cleared
// once it is unmounted; otherwise it is cleared when the handle is closed.
//
// Without autoclear, the caller is responsible for removing the loop device
// when done with it.
//
// On success the file handle of the loop device is returned.
func setupLoop(backingFile string, param LoopParams) (*os.File, error) {
        for retry := 1; retry < 100; retry++ {
                num, err := getFreeLoopDev()
                if err != nil {
                        return nil, err
                }

                file, err := setupLoopDev(backingFile, fmt.Sprintf(loopDevFormat, num), param)
                if err == nil {
                        return file, nil
                }

                // Per util-linux/sys-utils/losetup.c:create_loop(), grabbing a
                // free loop device can race with other users, in which case
                // setting it up fails with EBUSY. Anything else is fatal.
                if !strings.Contains(err.Error(), ebusyString) {
                        return nil, err
                }

                // Back off for a random, growing interval to avoid live lock.
                backoff := time.Duration(randutil.Intn(retry*10)) * time.Millisecond
                time.Sleep(backoff)
        }

        return nil, errors.New("timeout creating new loopback device")
}

// removeLoop opens the loop device at loopdev and detaches its backing file
// with the LOOP_CLR_FD ioctl.
func removeLoop(loopdev string) error {
        dev, err := os.Open(loopdev)
        if err != nil {
                return err
        }
        defer dev.Close()

        fd := int(dev.Fd())
        return unix.IoctlSetInt(fd, unix.LOOP_CLR_FD, 0)
}

// AttachLoopDevice attaches a specified backing file to a loop device
func AttachLoopDevice(backingFile string) (string, error) {
        file, err := setupLoop(backingFile, LoopParams{})
        if err != nil {
                return "", err
        }
        defer file.Close()
        return file.Name(), nil
}

// DetachLoopDevice detaches each of the given loop devices in order. It
// stops at the first device that cannot be removed and returns that error.
func DetachLoopDevice(devices ...string) error {
        for i := range devices {
                err := removeLoop(devices[i])
                if err == nil {
                        continue
                }
                return fmt.Errorf("failed to remove loop device: %s: %w", devices[i], err)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package mount

import (
        "fmt"
        "path/filepath"
        "strings"

        "github.com/containerd/containerd/api/types"
        "github.com/containerd/continuity/fs"
)

// Mount is the lingua franca of containerd. A mount represents a
// serialized mount syscall. Components either emit or consume mounts.
type Mount struct {
        // Type specifies the host-specific type of the mount.
        Type string
        // Source specifies where to mount from. Depending on the host system, this
        // can be a source path or device.
        Source string
        // Target specifies an optional subdirectory as a mountpoint. It assumes that
        // the subdirectory exists in a parent mount.
        Target string
        // Options contains zero or more fstab-style mount options. Typically,
        // these are platform specific.
        Options []string
}

// All applies every mount in mounts to target, in slice order, and stops at
// the first failure. If submounts are present, parent mounts are assumed to
// come before their child mounts.
func All(mounts []Mount, target string) error {
        for i := 0; i < len(mounts); i++ {
                m := mounts[i] // operate on a copy of the element
                if err := m.Mount(target); err != nil {
                        return err
                }
        }
        return nil
}

// UnmountMounts unmounts all the mounts under target, walking the mounts
// slice in reverse order.
//
// A failure to resolve a mount point is always returned. An unmount failure
// is returned only for the final element of mounts (the first one to be
// unmounted); unmount failures for the other elements are ignored.
func UnmountMounts(mounts []Mount, target string, flags int) error {
        last := len(mounts) - 1
        for i := last; i >= 0; i-- {
                mountpoint, err := fs.RootPath(target, mounts[i].Target)
                if err != nil {
                        return err
                }

                if err := UnmountAll(mountpoint, flags); err != nil && i == last { // last mount
                        return err
                }
        }
        return nil
}

// CanonicalizePath returns the absolute form of path with all symlinks
// resolved. The path must exist.
func CanonicalizePath(path string) (string, error) {
        // filepath.Abs cleans the result, so no separate Clean is needed.
        absolute, err := filepath.Abs(path)
        if err != nil {
                return "", err
        }

        resolved, err := filepath.EvalSymlinks(absolute)
        return resolved, err
}

// ReadOnly reports whether the "ro" option is present among the mount's
// options.
func (m *Mount) ReadOnly() bool {
        for i := range m.Options {
                if m.Options[i] != "ro" {
                        continue
                }
                return true
        }
        return false
}

// Mount mounts m at the provided target path.
//
// m.Target is joined with target via fs.RootPath to obtain the mount point.
// If that join fails, the returned error names both m.Target and the
// original root.
func (m *Mount) Mount(target string) error {
        // Use a separate variable for the result: reusing target with := would
        // assign to the parameter, and the error message below would then
        // report the failed result instead of the root that was passed in.
        mountpoint, err := fs.RootPath(target, m.Target)
        if err != nil {
                return fmt.Errorf("failed to join path %q with root %q: %w", m.Target, target, err)
        }
        return m.mount(mountpoint)
}

// readonlyMounts rewrites, in place, the options of every mount so that the
// mount is read-only, and returns the same slice.
//
// Overlay mounts are converted with readonlyOverlay. For any other type the
// "rw" and "ro" options are removed and a single "ro" is appended.
func readonlyMounts(mounts []Mount) []Mount {
        for i := range mounts {
                current := mounts[i].Options
                if mounts[i].Type == "overlay" {
                        mounts[i].Options = readonlyOverlay(current)
                        continue
                }

                filtered := make([]string, 0, len(current))
                for _, o := range current {
                        switch o {
                        case "rw", "ro":
                                // dropped: "ro" is re-added exactly once below
                        default:
                                filtered = append(filtered, o)
                        }
                }
                mounts[i].Options = append(filtered, "ro")
        }
        return mounts
}

// readonlyOverlay converts overlay mount options into a read-only form: the
// workdir and upperdir options are removed and, when a non-empty upperdir
// was given, its path is prepended to every lowerdir option - see:
// https://www.kernel.org/doc/html/latest/filesystems/overlayfs.html#multiple-lower-layers
func readonlyOverlay(opt []string) []string {
        const (
                upperKey = "upperdir="
                lowerKey = "lowerdir="
                workKey  = "workdir="
        )

        kept := make([]string, 0, len(opt))
        var upperDir string
        for _, o := range opt {
                switch {
                case strings.HasPrefix(o, upperKey):
                        upperDir = o[len(upperKey):]
                case strings.HasPrefix(o, workKey):
                        // dropped
                default:
                        kept = append(kept, o)
                }
        }
        if upperDir == "" {
                return kept
        }

        for i, o := range kept {
                if strings.HasPrefix(o, lowerKey) {
                        kept[i] = lowerKey + upperDir + ":" + o[len(lowerKey):]
                }
        }
        return kept
}

// ToProto converts a slice of [Mount] into the containerd API's protobuf
// representation, preserving order and copying each field as-is.
func ToProto(mounts []Mount) []*types.Mount {
        converted := make([]*types.Mount, 0, len(mounts))
        for i := range mounts {
                src := &mounts[i]
                converted = append(converted, &types.Mount{
                        Type:    src.Type,
                        Source:  src.Source,
                        Target:  src.Target,
                        Options: src.Options,
                })
        }
        return converted
}

// FromProto converts a slice of the protobuf definition [types.Mount] into
// a slice of [Mount], preserving order and copying each field as-is.
func FromProto(mm []*types.Mount) []Mount {
        converted := make([]Mount, 0, len(mm))
        for _, pm := range mm {
                converted = append(converted, Mount{
                        Type:    pm.Type,
                        Source:  pm.Source,
                        Target:  pm.Target,
                        Options: pm.Options,
                })
        }
        return converted
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package mount

import (
        "fmt"
        "os"
        "runtime"
        "strconv"
        "strings"
        "sync"
        "syscall"

        "golang.org/x/sys/unix"

        "github.com/containerd/containerd/v2/pkg/sys"
)

// parseIDMapping parses a single `container-id:host-id:size` mapping into a
// one-element ID map slice. All three fields must be integers that are not
// negative.
//
// TODO: Support multiple mappings in future
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
        fields := strings.Split(mapping, ":")
        if len(fields) != 3 {
                return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
        }

        // Error formats, indexed like fields: container id, host id, size.
        formats := [3]string{
                "invalid container id for user namespace remapping, %w",
                "invalid host id for user namespace remapping, %w",
                "invalid size for user namespace remapping, %w",
        }

        var values [3]int
        for i, field := range fields {
                n, err := strconv.Atoi(field)
                if err != nil {
                        return nil, fmt.Errorf(formats[i], err)
                }
                values[i] = n
        }

        for _, n := range values {
                if n < 0 {
                        return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers", mapping)
                }
        }

        idMap := syscall.SysProcIDMap{
                ContainerID: values[0],
                HostID:      values[1],
                Size:        values[2],
        }
        return []syscall.SysProcIDMap{idMap}, nil
}

// IDMapMount applies the UID/GID shift of the user namespace referred to by
// usernsFd to the mount tree at source and attaches the result at target.
//
// It clones the tree at source with open_tree, sets MOUNT_ATTR_IDMAP on the
// clone with mount_setattr, and moves the clone onto target with
// move_mount. The cloned tree's file descriptor is closed before returning.
func IDMapMount(source, target string, usernsFd int) (err error) {
        attr := unix.MountAttr{
                Attr_set:    unix.MOUNT_ATTR_IDMAP,
                Attr_clr:    0,
                Propagation: 0,
                Userns_fd:   uint64(usernsFd),
        }

        dFd, err := unix.OpenTree(-int(unix.EBADF), source, uint(unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC|unix.AT_EMPTY_PATH))
        if err != nil {
                // The tree being opened is source, so that is the path to report.
                return fmt.Errorf("Unable to open tree for %s: %w", source, err)
        }

        defer unix.Close(dFd)
        if err = unix.MountSetattr(dFd, "", unix.AT_EMPTY_PATH, &attr); err != nil {
                return fmt.Errorf("Unable to shift GID/UID for %s: %w", target, err)
        }

        if err = unix.MoveMount(dFd, "", -int(unix.EBADF), target, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
                return fmt.Errorf("Unable to attach mount tree to %s: %w", target, err)
        }
        return nil
}

// GetUsernsFD parses the uid and gid mappings (each in
// `container-id:host-id:size` form) and returns a file referring to a user
// namespace created with those mappings by forking the current process.
// A parse failure of either mapping is returned unchanged.
func GetUsernsFD(uidmap, gidmap string) (_usernsFD *os.File, _ error) {
        uidMaps, uidErr := parseIDMapping(uidmap)
        if uidErr != nil {
                return nil, uidErr
        }

        gidMaps, gidErr := parseIDMapping(gidmap)
        if gidErr != nil {
                return nil, gidErr
        }

        return getUsernsFD(uidMaps, gidMaps)
}

// getUsernsFD forks a child into a new user namespace, writes the given
// uid/gid mappings into the child's uid_map and gid_map files and returns an
// open handle to the child's user namespace.
//
// The child is always killed and reaped before returning. On error the
// namespace handle is closed and nil is returned.
func getUsernsFD(uidMaps, gidMaps []syscall.SysProcIDMap) (_usernsFD *os.File, retErr error) {
        runtime.LockOSThread()
        defer runtime.UnlockOSThread()

        pid, pidfd, errno := sys.ForkUserns()
        if errno != 0 {
                return nil, errno
        }

        pidFD := os.NewFile(pidfd, "pidfd")
        // Whatever happens below, kill the child, reap it and release the pidfd.
        defer func() {
                unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0)

                pidfdWaitid(pidFD)

                pidFD.Close()
        }()

        // NOTE:
        //
        // The usernsFD will hold the userns reference in kernel. Even if the
        // child process is reaped, the usernsFD is still valid.
        usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", pid))
        if err != nil {
                return nil, fmt.Errorf("failed to get userns file descriptor for /proc/%d/ns/user: %w", pid, err)
        }
        defer func() {
                if retErr != nil {
                        usernsFD.Close()
                }
        }()

        uidmapFile, err := os.OpenFile(fmt.Sprintf("/proc/%d/%s", pid, "uid_map"), os.O_WRONLY, 0600)
        if err != nil {
                return nil, fmt.Errorf("failed to open /proc/%d/uid_map: %w", pid, err)
        }
        defer uidmapFile.Close()

        gidmapFile, err := os.OpenFile(fmt.Sprintf("/proc/%d/%s", pid, "gid_map"), os.O_WRONLY, 0600)
        if err != nil {
                return nil, fmt.Errorf("failed to open /proc/%d/gid_map: %w", pid, err)
        }
        defer gidmapFile.Close()

        testHookKillChildBeforePidfdSendSignal(pid, pidFD)

        // Ensure the child process is still alive. If the err is ESRCH, we
        // should return error because we can't guarantee the usernsFD and
        // u[g]idmapFile are valid. It's safe to return error and retry.
        if err := unix.PidfdSendSignal(int(pidFD.Fd()), 0, nil, 0); err != nil {
                return nil, fmt.Errorf("failed to ensure child process is alive: %w", err)
        }

        testHookKillChildAfterPidfdSendSignal(pid, pidFD)

        // NOTE:
        //
        // The u[g]id_map file descriptor is still valid if the child process
        // is reaped.
        //
        // writeMappings writes one "container host size" line per mapping and
        // closes f, reporting the close error only if the write succeeded.
        writeMappings := func(f *os.File, idmap []syscall.SysProcIDMap) error {
                mappings := ""
                for _, m := range idmap {
                        mappings = fmt.Sprintf("%s%d %d %d\n", mappings, m.ContainerID, m.HostID, m.Size)
                }

                _, err := f.Write([]byte(mappings))
                if err1 := f.Close(); err1 != nil && err == nil {
                        err = err1
                }
                return err
        }

        if err := writeMappings(uidmapFile, uidMaps); err != nil {
                return nil, fmt.Errorf("failed to write uid_map: %w", err)
        }

        if err := writeMappings(gidmapFile, gidMaps); err != nil {
                return nil, fmt.Errorf("failed to write gid_map: %w", err)
        }
        return usernsFD, nil
}

func pidfdWaitid(pidFD *os.File) error {
        return sys.IgnoringEINTR(func() error {
                return unix.Waitid(unix.P_PIDFD, int(pidFD.Fd()), nil, unix.WEXITED, nil)
        })
}

// Test hooks invoked by getUsernsFD. Both default to no-ops.
var (
        // testHookLock is not used in the code visible here; presumably it
        // serializes tests that replace the hooks below — TODO confirm.
        testHookLock sync.Mutex

        // testHookKillChildBeforePidfdSendSignal is called just before the
        // liveness check of the forked child.
        testHookKillChildBeforePidfdSendSignal = func(_pid uintptr, _pidFD *os.File) {}

        // testHookKillChildAfterPidfdSendSignal is called right after the
        // liveness check of the forked child succeeded.
        testHookKillChildAfterPidfdSendSignal = func(_pid uintptr, _pidFD *os.File) {}
)

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package mount

import (
        "errors"
        "fmt"
        "os"
        "os/exec"
        "path"
        "path/filepath"
        "runtime"
        "strconv"
        "strings"
        "time"

        "github.com/containerd/containerd/v2/pkg/userns"
        "github.com/sirupsen/logrus"
        "golang.org/x/sys/unix"
)

// mountOpt is the parsed form of a mount's options, as produced by
// parseMountOptions and consumed by (*Mount).mount.
type mountOpt struct {
        // flags holds the mount flags (MS_* bits, including propagation types).
        flags   int
        // data holds the remaining options; they are joined with "," and
        // passed as the mount data string.
        data    []string
        // losetup requests that the source be attached to a loop device,
        // which is then mounted instead of the source.
        losetup bool
        // uidmap and gidmap are the ID mappings passed to GetUsernsFD;
        // idmapping is applied only when both are non-empty.
        uidmap  string
        gidmap  string
}

var (
        // pagesize is the system memory page size; the value here is only a
        // default that init replaces with os.Getpagesize(). It bounds the size
        // of the mount options string.
        pagesize              = 4096
        // allowedHelperBinaries lists the mount helper binaries that mount may
        // invoke, selected by the mount type's prefix.
        allowedHelperBinaries = []string{"mount.fuse", "mount.fuse3"}
)

// init replaces the default pagesize with the page size reported by the OS.
func init() {
        pagesize = os.Getpagesize()
}

// prepareIDMappedOverlay is a helper function to obtain the actual
// "lowerdir=..." mount option. It creates and applies id mapping for each
// lowerdir.
//
// It returns:
//  1. New options that include the new "lowerdir=..." mount option. The
//     options slice passed in is left untouched.
//  2. A "clean up" function. It is never nil, so it can be deferred before
//     checking the error; skipping it may leave the temporary mount points
//     of the lowerdirs behind.
//  3. Error -- nil if everything's fine, otherwise an error.
func prepareIDMappedOverlay(usernsFd int, options []string) ([]string, func(), error) {
        // Returned instead of nil so callers can defer the clean up
        // unconditionally; deferring a nil func panics when it is invoked.
        noop := func() {}

        lowerIdx, lowerDirs := findOverlayLowerdirs(options)
        if lowerIdx == -1 {
                return options, noop, fmt.Errorf("failed to parse overlay lowerdir's from given options")
        }

        tmpLowerdirs, idMapCleanUp, err := doPrepareIDMappedOverlay(lowerDirs, usernsFd)
        if idMapCleanUp == nil {
                idMapCleanUp = noop
        }
        if err != nil {
                return options, idMapCleanUp, fmt.Errorf("failed to create idmapped mount: %w", err)
        }

        // Build the result in a fresh slice: appending onto options[:lowerIdx]
        // would overwrite the caller's backing array.
        remapped := make([]string, 0, len(options))
        remapped = append(remapped, options[:lowerIdx]...)
        remapped = append(remapped, options[lowerIdx+1:]...)
        remapped = append(remapped, fmt.Sprintf("lowerdir=%s", strings.Join(tmpLowerdirs, ":")))

        return remapped, idMapCleanUp, nil
}

// mount mounts m at the provided target path.
//
// If m.Type starts with "fuse." or "fuse3.", the "mount.fuse" or
// "mount.fuse3" helper binary is called instead. Otherwise the options are
// parsed and, in order: an idmapped setup is prepared when both uidmap and
// gidmap are given, oversized overlay lowerdir options are compacted, the
// source is optionally attached to a loop device, the mount is performed,
// the propagation type is changed, a read-only bind is remounted, and
// non-overlay mount points are idmapped.
func (m *Mount) mount(target string) (err error) {
        for _, helperBinary := range allowedHelperBinaries {
                // helperBinary = "mount.fuse", typePrefix = "fuse."
                typePrefix := strings.TrimPrefix(helperBinary, "mount.") + "."
                if strings.HasPrefix(m.Type, typePrefix) {
                        return m.mountWithHelper(helperBinary, typePrefix, target)
                }
        }
        var (
                chdir     string
                recalcOpt bool
                usernsFd  *os.File
                options   = m.Options
        )

        opt := parseMountOptions(options)
        // The only remapping of both GID and UID is supported
        if opt.uidmap != "" && opt.gidmap != "" {
                if usernsFd, err = GetUsernsFD(opt.uidmap, opt.gidmap); err != nil {
                        return err
                }
                defer usernsFd.Close()

                // overlay expects lowerdir's to be remapped instead
                if m.Type == "overlay" {
                        var (
                                userNsCleanUp func()
                        )
                        options, userNsCleanUp, err = prepareIDMappedOverlay(int(usernsFd.Fd()), options)
                        // The clean up must be registered before the error check
                        // so partially created temporary mounts are removed, but
                        // it may be nil on early failures and deferring a nil
                        // func would panic on return.
                        if userNsCleanUp != nil {
                                defer userNsCleanUp()
                        }

                        if err != nil {
                                return fmt.Errorf("failed to prepare idmapped overlay: %w", err)
                        }
                        // To not meet concurrency issues while using the same lowedirs
                        // for different containers, replace them by temporary directories,
                        if optionsSize(options) >= pagesize-512 {
                                recalcOpt = true
                        } else {
                                opt = parseMountOptions(options)
                        }
                }
        }

        // avoid hitting one page limit of mount argument buffer
        //
        // NOTE: 512 is a buffer during pagesize check.
        if m.Type == "overlay" && optionsSize(options) >= pagesize-512 {
                chdir, options = compactLowerdirOption(options)
                // recalculate opt in case of lowerdirs have been replaced
                // by idmapped ones OR idmapped mounts' not used/supported.
                if recalcOpt || (opt.uidmap == "" || opt.gidmap == "") {
                        opt = parseMountOptions(options)
                }
        }

        // propagation types.
        const ptypes = unix.MS_SHARED | unix.MS_PRIVATE | unix.MS_SLAVE | unix.MS_UNBINDABLE

        // Ensure propagation type change flags aren't included in other calls.
        oflags := opt.flags &^ ptypes

        var loopParams LoopParams
        if opt.losetup {
                loopParams = LoopParams{
                        Readonly:  oflags&unix.MS_RDONLY == unix.MS_RDONLY,
                        Autoclear: true,
                }
                loopParams.Direct, opt.data = hasDirectIO(opt.data)
        }

        dataInStr := strings.Join(opt.data, ",")
        if len(dataInStr) > pagesize {
                return errors.New("mount options is too long")
        }

        // In the case of remounting with changed data (dataInStr != ""), need to call mount (moby/moby#34077).
        if opt.flags&unix.MS_REMOUNT == 0 || dataInStr != "" {
                // Initial call applying all non-propagation flags for mount
                // or remount with changed data
                source := m.Source
                if opt.losetup {
                        loFile, err := setupLoop(m.Source, loopParams)
                        if err != nil {
                                return err
                        }
                        defer loFile.Close()

                        // Mount the loop device instead
                        source = loFile.Name()
                }
                if err := mountAt(chdir, source, target, m.Type, uintptr(oflags), dataInStr); err != nil {
                        return err
                }
        }

        if opt.flags&ptypes != 0 {
                // Change the propagation type.
                const pflags = ptypes | unix.MS_REC | unix.MS_SILENT
                if err := unix.Mount("", target, "", uintptr(opt.flags&pflags), ""); err != nil {
                        return err
                }
        }

        const broflags = unix.MS_BIND | unix.MS_RDONLY
        if oflags&broflags == broflags {
                // Preserve CL_UNPRIVILEGED "locked" flags of the
                // bind mount target when we remount to make the bind readonly.
                // This is necessary to ensure that
                // bind-mounting "with options" will not fail with user namespaces, due to
                // kernel restrictions that require user namespace mounts to preserve
                // CL_UNPRIVILEGED locked flags.
                var unprivFlags int
                if userns.RunningInUserNS() {
                        unprivFlags, err = getUnprivilegedMountFlags(target)
                        if err != nil {
                                return err
                        }
                }
                // Remount the bind to apply read only.
                return unix.Mount("", target, "", uintptr(oflags|unprivFlags|unix.MS_REMOUNT), "")
        }

        // remap non-overlay mount point
        if opt.uidmap != "" && opt.gidmap != "" && m.Type != "overlay" {
                if err := IDMapMount(target, target, int(usernsFd.Fd())); err != nil {
                        return err
                }
        }
        return nil
}

// getUnprivilegedMountFlags returns the set of mount flags that are set on
// the mount that contains the given path and are locked by CL_UNPRIVILEGED.
//
// The flags are read with statfs(2). If statfs fails, its error is returned
// together with a zero flag set.
//
// From https://github.com/moby/moby/blob/v23.0.1/daemon/oci_linux.go#L430-L460
func getUnprivilegedMountFlags(path string) (int, error) {
        var statfs unix.Statfs_t
        if err := unix.Statfs(path, &statfs); err != nil {
                return 0, err
        }

        // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
        unprivilegedFlags := []int{
                unix.MS_RDONLY,
                unix.MS_NODEV,
                unix.MS_NOEXEC,
                unix.MS_NOSUID,
                unix.MS_NOATIME,
                unix.MS_RELATIME,
                unix.MS_NODIRATIME,
        }

        var flags int
        // Iterate over the flag values, not the slice indices: ranging with a
        // single variable would test the numbers 0..6 instead of the MS_* bits.
        for _, flag := range unprivilegedFlags {
                if int(statfs.Flags)&flag == flag {
                        flags |= flag
                }
        }

        return flags, nil
}

// doPrepareIDMappedOverlay creates an ID-mapped mount of every entry in
// lowerDirs beneath a fresh temporary directory inside tempMountLocation.
//
// Entry i of lowerDirs is mounted with IDMapMount (using usernsFd) onto
// <tmp>/<i>; the returned slice lists those mount points in the same order.
//
// The returned cleanup function is non-nil whenever the temporary directory
// was created, including when an error is returned. It unmounts every
// lower directory that was successfully mounted and removes the temporary
// directory; failures are logged rather than returned.
func doPrepareIDMappedOverlay(lowerDirs []string, usernsFd int) (tmpLowerDirs []string, _ func(), _ error) {
        td, err := os.MkdirTemp(tempMountLocation, "ovl-idmapped")
        if err != nil {
                return nil, nil, err
        }

        // Track state for the cleanup closure in dedicated locals. The closure
        // must not read the named result tmpLowerDirs: the error returns below
        // set it to nil (and it is nil for an empty lowerDirs), which used to
        // make the cleanup skip every unmount and panic on tmpLowerDirs[0].
        var mounted []string
        cleanUp := func() {
                for _, lowerDir := range mounted {
                        if err := unix.Unmount(lowerDir, 0); err != nil {
                                logrus.WithError(err).Warnf("failed to unmount temp lowerdir %s", lowerDir)
                        }
                }
                // td is the parent of every temporary lower directory.
                if terr := os.RemoveAll(td); terr != nil {
                        logrus.WithError(terr).Warnf("failed to remove temporary overlay lowerdir's")
                }
        }
        for i, lowerDir := range lowerDirs {
                tmpLowerDir := filepath.Join(td, strconv.Itoa(i))

                if err = os.MkdirAll(tmpLowerDir, 0700); err != nil {
                        return nil, cleanUp, fmt.Errorf("failed to create temporary dir: %w", err)
                }
                if err = IDMapMount(lowerDir, tmpLowerDir, usernsFd); err != nil {
                        return nil, cleanUp, err
                }
                // Only record directories that actually carry a mount.
                mounted = append(mounted, tmpLowerDir)
        }
        return mounted, cleanUp, nil
}

// Unmount unmounts the provided mount path with the given flags.
// An EINVAL result from the underlying unmount is not reported as an error.
func Unmount(target string, flags int) error {
        switch err := unmount(target, flags); err {
        case nil, unix.EINVAL:
                return nil
        default:
                return err
        }
}

// fuseSuperMagic is the FUSE filesystem type value, as defined in statfs(2).
const fuseSuperMagic = 0x65735546

// isFUSE reports whether statfs identifies dir as residing on a FUSE
// filesystem. A statfs failure yields false.
func isFUSE(dir string) bool {
        var fsInfo unix.Statfs_t
        err := unix.Statfs(dir, &fsInfo)
        return err == nil && fsInfo.Type == fuseSuperMagic
}

// unmountFUSE tries to unmount target by running a fusermount helper binary
// with "-u". fusermount3 is tried first, then fusermount; the first helper
// that succeeds ends the attempt. If both fail, the error of the last helper
// is returned.
//
// For FUSE mounts, using these helper binaries is preferred, see:
// https://github.com/containerd/containerd/pull/3765#discussion_r342083514
func unmountFUSE(target string) error {
        helpers := [...]string{"fusermount3", "fusermount"}

        var lastErr error
        for i := range helpers {
                lastErr = exec.Command(helpers[i], "-u", target).Run()
                if lastErr == nil {
                        return nil
                }
        }
        return lastErr
}

// unmount unmounts target with the given flags.
//
// When target is on a FUSE filesystem the fusermount helpers are tried first;
// if they fail, the regular unmount path below is used. The unmount syscall is
// attempted up to 50 times, sleeping 50ms after each EBUSY. Any other error is
// returned immediately; if every attempt reports EBUSY, a wrapped EBUSY error
// is returned.
func unmount(target string, flags int) error {
        if isFUSE(target) && unmountFUSE(target) == nil {
                return nil
        }

        for attempt := 0; attempt < 50; attempt++ {
                err := unix.Unmount(target, flags)
                if err == nil {
                        return nil
                }
                if err != unix.EBUSY {
                        return err
                }
                // Still busy: back off briefly before retrying.
                time.Sleep(50 * time.Millisecond)
        }
        return fmt.Errorf("failed to unmount target %s: %w", target, unix.EBUSY)
}

// UnmountAll keeps unmounting the given mount point until nothing is mounted
// there any more (signalled by EINVAL from unmount). This undoes a stack of
// mounts on the same mount point.
//
// UnmountAll is a no-op when mount is an empty string, which happens when the
// containerd client did not specify any rootfs mounts (e.g. because the rootfs
// is managed outside containerd). It is also a no-op when the mount path does
// not exist.
func UnmountAll(mount string, flags int) error {
        if mount == "" {
                return nil
        }
        if _, statErr := os.Stat(mount); os.IsNotExist(statErr) {
                return nil
        }

        for {
                err := unmount(mount, flags)
                if err == nil {
                        // Something was unmounted; there may be more stacked below.
                        continue
                }
                // EINVAL is returned if the target is not a mount point,
                // indicating that we are done. It can also indicate a few
                // other things (such as invalid flags) which we unfortunately
                // end up squelching here too.
                if err == unix.EINVAL {
                        return nil
                }
                return err
        }
}

// parseMountOptions converts fstab style mount options into the pieces needed
// for a standard mount() syscall: flag bits, the loop-device request, the
// uid/gid mappings, and the remaining filesystem-specific data options.
func parseMountOptions(options []string) (opt mountOpt) {
        const (
                loopOpt      = "loop"
                uidmapPrefix = "uidmap="
                gidmapPrefix = "gidmap="
        )
        type flagEntry struct {
                clear bool
                flag  int
        }
        flagsMap := map[string]flagEntry{
                "async":         {true, unix.MS_SYNCHRONOUS},
                "atime":         {true, unix.MS_NOATIME},
                "bind":          {false, unix.MS_BIND},
                "defaults":      {false, 0},
                "dev":           {true, unix.MS_NODEV},
                "diratime":      {true, unix.MS_NODIRATIME},
                "dirsync":       {false, unix.MS_DIRSYNC},
                "exec":          {true, unix.MS_NOEXEC},
                "mand":          {false, unix.MS_MANDLOCK},
                "noatime":       {false, unix.MS_NOATIME},
                "nodev":         {false, unix.MS_NODEV},
                "nodiratime":    {false, unix.MS_NODIRATIME},
                "noexec":        {false, unix.MS_NOEXEC},
                "nomand":        {true, unix.MS_MANDLOCK},
                "norelatime":    {true, unix.MS_RELATIME},
                "nostrictatime": {true, unix.MS_STRICTATIME},
                "nosuid":        {false, unix.MS_NOSUID},
                "rbind":         {false, unix.MS_BIND | unix.MS_REC},
                "relatime":      {false, unix.MS_RELATIME},
                "remount":       {false, unix.MS_REMOUNT},
                "ro":            {false, unix.MS_RDONLY},
                "rw":            {true, unix.MS_RDONLY},
                "strictatime":   {false, unix.MS_STRICTATIME},
                "suid":          {true, unix.MS_NOSUID},
                "sync":          {false, unix.MS_SYNCHRONOUS},
        }

        for _, o := range options {
                entry, known := flagsMap[o]
                switch {
                case known && entry.flag != 0:
                        // A recognised option with a real flag bit either
                        // clears or sets that bit.
                        if entry.clear {
                                opt.flags &^= entry.flag
                        } else {
                                opt.flags |= entry.flag
                        }
                case o == loopOpt:
                        opt.losetup = true
                case strings.HasPrefix(o, uidmapPrefix):
                        opt.uidmap = strings.TrimPrefix(o, uidmapPrefix)
                case strings.HasPrefix(o, gidmapPrefix):
                        opt.gidmap = strings.TrimPrefix(o, gidmapPrefix)
                default:
                        // Not in the flags table, or its flag is zero (not
                        // supported on the platform): treat it as a data value
                        // for a specific fs type.
                        opt.data = append(opt.data, o)
                }
        }
        return opt
}

// hasDirectIO reports whether opts contains the "direct-io" option.
//
// When it does, the second result is a newly allocated slice holding opts
// without the first "direct-io" entry; the caller's slice is left untouched.
// When it does not, opts itself is returned unchanged.
func hasDirectIO(opts []string) (bool, []string) {
        for idx, opt := range opts {
                if opt != "direct-io" {
                        continue
                }
                // Build a fresh slice rather than append(opts[:idx], ...),
                // which would overwrite elements of the caller's backing array.
                rest := make([]string, 0, len(opts)-1)
                rest = append(rest, opts[:idx]...)
                rest = append(rest, opts[idx+1:]...)
                return true, rest
        }
        return false, opts
}

// compactLowerdirOption rewrites the overlay lowerdir option so that its
// entries are relative to a shared parent directory, and returns that
// directory (with a trailing slash) alongside the updated options.
//
// If nothing can be compacted (no lowerdir option, a single lowerdir, no
// usable common directory, or an entry not longer than the common directory)
// it returns "" and the original opts.
func compactLowerdirOption(opts []string) (string, []string) {
        lowerIdx, lowerDirs := findOverlayLowerdirs(opts)
        if lowerIdx == -1 || len(lowerDirs) == 1 {
                // no need to compact if there is only one lowerdir
                return "", opts
        }

        shared := longestCommonPrefix(lowerDirs)
        if shared == "" {
                return "", opts
        }

        // NOTE: the snapshot id is based on digits.
        // in order to avoid to get snapshots/x, should be back to parent dir.
        // however, there is assumption that the common dir is ${root}/io.containerd.v1.overlayfs/snapshots.
        parent := path.Dir(shared)
        switch parent {
        case "/", ".":
                return "", opts
        }
        parent += "/"

        relDirs := make([]string, len(lowerDirs))
        for i, dir := range lowerDirs {
                if len(dir) <= len(parent) {
                        return "", opts
                }
                relDirs[i] = dir[len(parent):]
        }

        // Work on a copy so the caller's options are not modified, then move
        // the rewritten lowerdir option to the end.
        compacted := copyOptions(opts)
        compacted = append(compacted[:lowerIdx], compacted[lowerIdx+1:]...)
        compacted = append(compacted, fmt.Sprintf("lowerdir=%s", strings.Join(relDirs, ":")))
        return parent, compacted
}

// findOverlayLowerdirs locates the first "lowerdir=" option in opts and
// returns its index together with its colon-separated directories.
// It returns -1 and nil when no such option is present.
func findOverlayLowerdirs(opts []string) (int, []string) {
        const key = "lowerdir="

        for i, o := range opts {
                if strings.HasPrefix(o, key) {
                        return i, strings.Split(o[len(key):], ":")
                }
        }
        return -1, nil
}

// longestCommonPrefix returns the longest prefix shared by every string in
// strs. An empty slice yields "", and a single string yields itself.
func longestCommonPrefix(strs []string) string {
        switch len(strs) {
        case 0:
                return ""
        case 1:
                return strs[0]
        }

        // The prefix common to the lexicographically smallest and largest
        // strings is common to everything in between.
        lo, hi := strs[0], strs[0]
        for _, s := range strs[1:] {
                if s < lo {
                        lo = s
                }
                if s > hi {
                        hi = s
                }
        }

        limit := len(lo)
        if len(hi) < limit {
                limit = len(hi)
        }
        for i := 0; i < limit; i++ {
                if lo[i] != hi[i] {
                        return lo[:i]
                }
        }
        return lo
}

// copyOptions returns an independent copy of opts, or nil when opts is empty.
func copyOptions(opts []string) []string {
        n := len(opts)
        if n == 0 {
                return nil
        }
        return append(make([]string, 0, n), opts...)
}

// optionsSize returns the total length in bytes of all mount options in opts.
func optionsSize(opts []string) int {
        var total int
        for i := range opts {
                total += len(opts[i])
        }
        return total
}

// mountAt issues a mount call for source onto target, optionally after
// changing the working directory to chdir.
//
// When chdir is empty, unix.Mount is called directly. Otherwise the mount is
// performed from a separate goroutine that is locked to its OS thread,
// unshares CLONE_FS and then changes directory to chdir before mounting.
// NOTE(review): presumably this lets relative paths in the mount arguments
// resolve against chdir without affecting the rest of the process — confirm
// against callers.
//
// The first error from unshare, chdir or mount is returned.
func mountAt(chdir string, source, target, fstype string, flags uintptr, data string) error {
        if chdir == "" {
                return unix.Mount(source, target, fstype, flags, data)
        }

        // Buffered with capacity 1 so the goroutine's single send never blocks.
        ch := make(chan error, 1)
        go func() {
                runtime.LockOSThread()

                // Do not unlock this thread.
                // If the thread is unlocked go will try to use it for other goroutines.
                // However it is not possible to restore the thread state after CLONE_FS.
                //
                // Once the goroutine exits the thread should eventually be terminated by go.

                if err := unix.Unshare(unix.CLONE_FS); err != nil {
                        ch <- err
                        return
                }

                if err := unix.Chdir(chdir); err != nil {
                        ch <- err
                        return
                }

                ch <- unix.Mount(source, target, fstype, flags, data)
        }()
        // Wait for the goroutine to report the outcome.
        return <-ch
}

// mountWithHelper mounts m onto target by executing an external mount helper.
//
// The helper is invoked as "<helperBinary> <source> <target> -o <opt>... -t <type>",
// where <type> is m.Type with typePrefix stripped. For example:
//
//      helperBinary: "mount.fuse3"
//      target: "/foo/merged"
//      m.Type: "fuse3.fuse-overlayfs"
//      command: "mount.fuse3 overlay /foo/merged -o lowerdir=/foo/lower2:/foo/lower1,upperdir=/foo/upper,workdir=/foo/work -t fuse-overlayfs"
//
// The helper is retried when it fails with ECHILD; any other failure is
// returned together with the helper's combined output.
func (m *Mount) mountWithHelper(helperBinary, typePrefix, target string) error {
        args := make([]string, 0, 2*len(m.Options)+4)
        args = append(args, m.Source, target)
        for _, option := range m.Options {
                args = append(args, "-o", option)
        }
        args = append(args, "-t", strings.TrimPrefix(m.Type, typePrefix))

        before, err := Lookup(target)
        if err != nil {
                return err
        }

        // cmd.CombinedOutput() may intermittently return ECHILD because of our signal handling in shim.
        // See #4387 and wait(2).
        const retriesOnECHILD = 10
        for attempt := 0; attempt < retriesOnECHILD; attempt++ {
                output, runErr := exec.Command(helperBinary, args...).CombinedOutput()
                if runErr == nil {
                        return nil
                }
                if !errors.Is(runErr, unix.ECHILD) {
                        return fmt.Errorf("mount helper [%s %v] failed: %q: %w", helperBinary, args, string(output), runErr)
                }

                // We got ECHILD, so we cannot tell whether the mount succeeded.
                // A changed mount ID proves a new mount appeared, but not that
                // it is complete, so unmount it before retrying.
                after, lookupErr := Lookup(target)
                if lookupErr != nil {
                        return lookupErr
                }
                if after.ID != before.ID {
                        _ = unmount(target, 0)
                }
        }
        return fmt.Errorf("mount helper [%s %v] failed with ECHILD (retried %d times)", helperBinary, args, retriesOnECHILD)
}

//go:build !windows && !darwin && !openbsd

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package mount

import (
        "os"
        "sort"

        "github.com/moby/sys/mountinfo"
)

// UnmountRecursive unmounts target and every mount found beneath it, handling
// the deepest (longest) mount points first.
//
// It is a no-op for an empty target, and returns nil when canonicalizing the
// target reports that it does not exist. Unmount failures are ignored for all
// but the last mount point in the sorted order, whose error is returned.
func UnmountRecursive(target string, flags int) error {
        if target == "" {
                return nil
        }

        target, err := CanonicalizePath(target)
        if os.IsNotExist(err) {
                return nil
        }
        if err != nil {
                return err
        }

        infos, err := mountinfo.GetMounts(mountinfo.PrefixFilter(target))
        if err != nil {
                return err
        }

        // Collapse stacked mounts on the same mount point into one entry.
        unique := make(map[string]struct{}, len(infos))
        for _, info := range infos {
                unique[info.Mountpoint] = struct{}{}
        }
        paths := make([]string, 0, len(unique))
        for p := range unique {
                paths = append(paths, p)
        }

        // Make the deepest mount be first
        sort.SliceStable(paths, func(a, b int) bool {
                return len(paths[a]) > len(paths[b])
        })

        last := len(paths) - 1
        for i, p := range paths {
                if err := UnmountAll(p, flags); err != nil && i == last {
                        return err
                }
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package mount

import (
        "context"
        "fmt"
        "os"

        "github.com/containerd/log"
)

// tempMountLocation is the parent directory passed to os.MkdirTemp when
// creating temporary mount directories. It is initialized from getTempDir.
var tempMountLocation = getTempDir()

// WithTempMount mounts the provided mounts onto a freshly created temp dir and
// passes that dir to f. The mounts are valid for the duration of the call to f.
//
// The volatile option of overlayfs doesn't allow mounting again with the same
// upper / work dirs; since this is a temp mount, that option is dropped if
// present.
//
// The mounts are unmounted and the temp dir is removed regardless of the
// result of f. An unmount failure is folded into the returned error.
func WithTempMount(ctx context.Context, mounts []Mount, f func(root string) error) (err error) {
        root, mkErr := os.MkdirTemp(tempMountLocation, "containerd-mount")
        if mkErr != nil {
                return fmt.Errorf("failed to create temp dir: %w", mkErr)
        }

        // We use Remove here instead of RemoveAll.
        // RemoveAll deletes the temp dir and all children it contains, so when
        // the Unmount fails it would incorrectly delete data from the mounted
        // dir. With Remove the temp dir may leak in that case, but no data is
        // lost from the mounted dir.
        // For details, please refer to #1868 #1785.
        defer func() {
                if rmErr := os.Remove(root); rmErr != nil {
                        log.G(ctx).WithError(rmErr).WithField("dir", root).Error("failed to remove mount temp dir")
                }
        }()

        // Register the unmount before mounting; otherwise a partially failed
        // mount would never be unmounted.
        defer func() {
                unmountErr := UnmountMounts(mounts, root, 0)
                if unmountErr == nil {
                        return
                }
                unmountErr = fmt.Errorf("failed to unmount %s: %w", root, unmountErr)
                if err != nil {
                        err = fmt.Errorf("%s: %w", unmountErr.Error(), err)
                        return
                }
                err = unmountErr
        }()

        if mountErr := All(removeVolatileTempMount(mounts), root); mountErr != nil {
                return fmt.Errorf("failed to mount %s: %w", root, mountErr)
        }
        if cbErr := f(root); cbErr != nil {
                return fmt.Errorf("mount callback failed on %s: %w", root, cbErr)
        }
        return nil
}

// removeVolatileTempMount returns mounts with the first "volatile" option
// stripped from every overlay mount.
//
// The volatile option of overlayfs doesn't allow to mount again using the
// same upper / work dirs. Since it's a temp mount, avoid using that
// option here. Reference: https://docs.kernel.org/filesystems/overlayfs.html#volatile-mount
//
// The input is never modified: when something has to be removed, a copy of
// the slice is returned and each changed mount gets a newly allocated Options
// slice. When nothing has to be removed, mounts itself is returned.
//
// TODO: Make this logic conditional once the kernel supports reusing
// overlayfs volatile mounts.
func removeVolatileTempMount(mounts []Mount) []Mount {
        var out []Mount
        for i, m := range mounts {
                if m.Type != "overlay" {
                        continue
                }
                for j, opt := range m.Options {
                        if opt != "volatile" {
                                continue
                        }
                        if out == nil {
                                out = copyMounts(mounts)
                        }
                        // copyMounts is shallow, so out[i].Options still shares
                        // its backing array with the caller's mount. Build a
                        // fresh slice instead of appending in place, which
                        // would overwrite the caller's options.
                        opts := make([]string, 0, len(m.Options)-1)
                        opts = append(opts, m.Options[:j]...)
                        opts = append(opts, m.Options[j+1:]...)
                        out[i].Options = opts
                        break
                }
        }

        if out != nil {
                return out
        }

        return mounts
}

// copyMounts returns a new slice holding a copy of every Mount value in in,
// so that elements of the result can be replaced without altering the
// original slice.
func copyMounts(in []Mount) []Mount {
        return append(make([]Mount, 0, len(in)), in...)
}

// WithReadonlyTempMount behaves like WithTempMount, except that the provided
// mounts are first converted with readonlyMounts. The temp dir is passed to f,
// the mounts are valid during the call to f, and the temp dir is unmounted
// and removed regardless of the result of f.
func WithReadonlyTempMount(ctx context.Context, mounts []Mount, f func(root string) error) (err error) {
        roMounts := readonlyMounts(mounts)
        return WithTempMount(ctx, roMounts, f)
}

// getTempDir returns the value of XDG_RUNTIME_DIR when it is set to a
// non-empty value, and os.TempDir() otherwise.
func getTempDir() string {
        dir := os.Getenv("XDG_RUNTIME_DIR")
        if dir == "" {
                return os.TempDir()
        }
        return dir
}

//go:build !windows && !darwin

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package mount

import (
        "os"
        "sort"

        "github.com/moby/sys/mountinfo"
)

// SetTempMountLocation creates root (mode 0700, including parents) and makes
// its canonicalized path the temporary mount location.
func SetTempMountLocation(root string) error {
        if err := os.MkdirAll(root, 0700); err != nil {
                return err
        }
        // We need to pass canonicalized path to mountinfo.PrefixFilter in CleanupTempMounts
        canonical, err := CanonicalizePath(root)
        tempMountLocation = canonical
        return err
}

// CleanupTempMounts unmounts every mount found under the temporary mount
// location and removes the mount point directories.
//
// Failures to unmount or remove an individual mount point are collected in
// warnings; err is set only when the mount list itself cannot be read.
func CleanupTempMounts(flags int) (warnings []error, err error) {
        infos, err := mountinfo.GetMounts(mountinfo.PrefixFilter(tempMountLocation))
        if err != nil {
                return nil, err
        }

        // Make the deepest mount be first
        sort.Slice(infos, func(a, b int) bool {
                return len(infos[a].Mountpoint) > len(infos[b].Mountpoint)
        })

        for _, info := range infos {
                mountpoint := info.Mountpoint
                stepErr := UnmountAll(mountpoint, flags)
                if stepErr == nil {
                        // Only remove the directory once nothing is mounted on it.
                        stepErr = os.Remove(mountpoint)
                }
                if stepErr != nil {
                        warnings = append(warnings, stepErr)
                }
        }
        return warnings, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package auth

import (
        "context"
        "encoding/json"
        "errors"
        "fmt"
        "net/http"
        "net/url"
        "strings"
        "time"

        remoteserrors "github.com/containerd/containerd/v2/core/remotes/errors"
        "github.com/containerd/containerd/v2/version"
        "github.com/containerd/log"
)

var (
        // ErrNoToken is returned if a request is successful but the body does not
        // contain an authorization token.
        ErrNoToken = errors.New("authorization server did not include a token in the response")
)

// GenerateTokenOptions builds the options for fetching a token from the
// parameters of an auth challenge.
//
// The challenge must carry a "realm" parameter that parses as a URL;
// otherwise an error is returned. The "service" parameter is copied as-is and
// the "scope" parameter, when present, is split on single spaces. A missing
// scope is only logged at debug level.
func GenerateTokenOptions(ctx context.Context, host, username, secret string, c Challenge) (TokenOptions, error) {
        var none TokenOptions

        rawRealm, hasRealm := c.Parameters["realm"]
        if !hasRealm {
                return none, errors.New("no realm specified for token auth challenge")
        }

        parsedRealm, err := url.Parse(rawRealm)
        if err != nil {
                return none, fmt.Errorf("invalid token auth challenge realm: %w", err)
        }

        opts := TokenOptions{
                Realm:    parsedRealm.String(),
                Service:  c.Parameters["service"],
                Username: username,
                Secret:   secret,
        }

        if scope, hasScope := c.Parameters["scope"]; !hasScope {
                log.G(ctx).WithField("host", host).Debug("no scope specified for token auth challenge")
        } else {
                opts.Scopes = append(opts.Scopes, strings.Split(scope, " ")...)
        }

        return opts, nil
}

// TokenOptions are options for requesting a token
type TokenOptions struct {
        // Realm is the URL the token request is sent to.
        Realm    string
        // Service is sent as the "service" parameter of the token request.
        Service  string
        // Scopes are sent as the "scope" parameter(s) of the token request.
        Scopes   []string
        // Username is the account name. FetchTokenWithOAuth uses the password
        // grant when it is non-empty and the refresh-token grant otherwise.
        Username string
        // Secret is sent as the password, or as the refresh token when
        // Username is empty (FetchTokenWithOAuth). FetchToken uses it for basic
        // auth when it is non-empty.
        Secret   string

        // FetchRefreshToken enables fetching a refresh token (aka "identity token", "offline token") along with the bearer token.
        //
        // For HTTP GET mode (FetchToken), FetchRefreshToken sets `offline_token=true` in the request.
        // https://docs.docker.com/registry/spec/auth/token/#requesting-a-token
        //
        // For HTTP POST mode (FetchTokenWithOAuth), FetchRefreshToken sets `access_type=offline` in the request.
        // https://docs.docker.com/registry/spec/auth/oauth/#getting-a-token
        FetchRefreshToken bool
}

// OAuthTokenResponse is the response from fetching a token with an OAuth POST
// request. Each field is decoded from the JSON key named in its tag.
type OAuthTokenResponse struct {
        // AccessToken is the issued token; FetchTokenWithOAuth returns
        // ErrNoToken when it is empty.
        AccessToken      string    `json:"access_token"`
        RefreshToken     string    `json:"refresh_token"`
        // ExpiresInSeconds is decoded from "expires_in".
        ExpiresInSeconds int       `json:"expires_in"`
        IssuedAt         time.Time `json:"issued_at"`
        Scope            string    `json:"scope"`
}

// FetchTokenWithOAuth requests a token from to.Realm with a form-encoded HTTP
// POST.
//
// The password grant is used when to.Username is set; otherwise to.Secret is
// sent as a refresh token. The supplied headers are added to the request, and
// a default User-Agent is set when none is present. A response status outside
// [200, 400) yields an unexpected-status error, and a response without an
// access token yields ErrNoToken.
func FetchTokenWithOAuth(ctx context.Context, client *http.Client, headers http.Header, clientID string, to TokenOptions) (*OAuthTokenResponse, error) {
        form := url.Values{}
        form.Set("service", to.Service)
        form.Set("client_id", clientID)
        if len(to.Scopes) > 0 {
                form.Set("scope", strings.Join(to.Scopes, " "))
        }

        if to.Username != "" {
                form.Set("grant_type", "password")
                form.Set("username", to.Username)
                form.Set("password", to.Secret)
        } else {
                form.Set("grant_type", "refresh_token")
                form.Set("refresh_token", to.Secret)
        }
        if to.FetchRefreshToken {
                form.Set("access_type", "offline")
        }

        body := strings.NewReader(form.Encode())
        req, err := http.NewRequestWithContext(ctx, http.MethodPost, to.Realm, body)
        if err != nil {
                return nil, err
        }
        req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=utf-8")
        for name, values := range headers {
                req.Header[name] = append(req.Header[name], values...)
        }
        if req.Header.Get("User-Agent") == "" {
                req.Header.Set("User-Agent", "containerd/"+version.Version)
        }

        resp, err := client.Do(req)
        if err != nil {
                return nil, err
        }
        defer resp.Body.Close()

        if code := resp.StatusCode; code < 200 || code >= 400 {
                return nil, remoteserrors.NewUnexpectedStatusErr(resp)
        }

        var tr OAuthTokenResponse
        if err := json.NewDecoder(resp.Body).Decode(&tr); err != nil {
                return nil, fmt.Errorf("unable to decode token response: %w", err)
        }
        if tr.AccessToken == "" {
                return nil, ErrNoToken
        }
        return &tr, nil
}

// FetchTokenResponse is the response from fetching a token with a GET request.
// Each field is decoded from the JSON key named in its tag.
type FetchTokenResponse struct {
        // Token is the issued token. FetchToken overwrites it with AccessToken
        // when AccessToken is non-empty, and returns ErrNoToken when it ends up
        // empty.
        Token            string    `json:"token"`
        // AccessToken is treated by FetchToken as equivalent to Token.
        AccessToken      string    `json:"access_token"`
        // ExpiresInSeconds is decoded from "expires_in".
        ExpiresInSeconds int       `json:"expires_in"`
        IssuedAt         time.Time `json:"issued_at"`
        RefreshToken     string    `json:"refresh_token"`
}

// FetchToken requests a token from to.Realm with an HTTP GET.
//
// The service, scopes and (optionally) offline_token are sent as query
// parameters, and basic auth is used when to.Secret is set. The supplied
// headers are added to the request, and a default User-Agent is set when none
// is present. A response status outside [200, 400) yields an
// unexpected-status error, and a response without a token yields ErrNoToken.
func FetchToken(ctx context.Context, client *http.Client, headers http.Header, to TokenOptions) (*FetchTokenResponse, error) {
        req, err := http.NewRequestWithContext(ctx, http.MethodGet, to.Realm, nil)
        if err != nil {
                return nil, err
        }

        for name, values := range headers {
                req.Header[name] = append(req.Header[name], values...)
        }
        if req.Header.Get("User-Agent") == "" {
                req.Header.Set("User-Agent", "containerd/"+version.Version)
        }

        query := req.URL.Query()
        if to.Service != "" {
                query.Add("service", to.Service)
        }
        for i := range to.Scopes {
                query.Add("scope", to.Scopes[i])
        }
        if to.FetchRefreshToken {
                query.Add("offline_token", "true")
        }
        req.URL.RawQuery = query.Encode()

        if to.Secret != "" {
                req.SetBasicAuth(to.Username, to.Secret)
        }

        resp, err := client.Do(req)
        if err != nil {
                return nil, err
        }
        defer resp.Body.Close()

        if code := resp.StatusCode; code < 200 || code >= 400 {
                return nil, remoteserrors.NewUnexpectedStatusErr(resp)
        }

        var tr FetchTokenResponse
        if err := json.NewDecoder(resp.Body).Decode(&tr); err != nil {
                return nil, fmt.Errorf("unable to decode token response: %w", err)
        }

        // `access_token` is equivalent to `token` and if both are specified
        // the choice is undefined.  Canonicalize `access_token` by sticking
        // things in `token`.
        if tr.AccessToken != "" {
                tr.Token = tr.AccessToken
        }
        if tr.Token == "" {
                return nil, ErrNoToken
        }
        return &tr, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package auth

import (
        "net/http"
        "sort"
        "strings"
)

// AuthenticationScheme defines the scheme of the authentication method.
// Each scheme is a distinct bit (1 << iota), so the constants below have the
// values 1, 2 and 4 in declaration order.
type AuthenticationScheme byte

const (
        // BasicAuth is scheme for Basic HTTP Authentication RFC 7617
        BasicAuth AuthenticationScheme = 1 << iota
        // DigestAuth is scheme for HTTP Digest Access Authentication RFC 7616
        DigestAuth
        // BearerAuth is scheme for OAuth 2.0 Bearer Tokens RFC 6750
        BearerAuth
)

// Challenge carries information from a WWW-Authenticate response header.
// See RFC 2617.
type Challenge struct {
        // Scheme is the auth-scheme according to RFC 2617
        Scheme AuthenticationScheme

        // Parameters are the auth-params according to RFC 2617, keyed by
        // lower-cased parameter name
        Parameters map[string]string
}

// byScheme implements sort.Interface over a slice of challenges, ordering
// them by descending scheme value.
type byScheme []Challenge

// Len reports the number of challenges.
func (bs byScheme) Len() int {
        return len(bs)
}

// Swap exchanges the challenges at positions i and j.
func (bs byScheme) Swap(i, j int) {
        bs[j], bs[i] = bs[i], bs[j]
}

// Less puts the challenge with the larger scheme value first, which gives
// the priority order bearer > digest > basic.
func (bs byScheme) Less(i, j int) bool {
        return bs[j].Scheme < bs[i].Scheme
}

// Octet types from RFC 2616.
// octetType is a bit set of the character classes a byte belongs to.
type octetType byte

// octetTypes maps every possible byte value to its character classes; it is
// populated once by init.
var octetTypes [256]octetType

const (
        // isToken marks bytes that may appear in an RFC 2616 token
        isToken octetType = 1 << iota
        // isSpace marks space, tab, carriage return and line feed
        isSpace
)

func init() {
        // OCTET      = <any 8-bit sequence of data>
        // CHAR       = <any US-ASCII character (octets 0 - 127)>
        // CTL        = <any US-ASCII control character (octets 0 - 31) and DEL (127)>
        // CR         = <US-ASCII CR, carriage return (13)>
        // LF         = <US-ASCII LF, linefeed (10)>
        // SP         = <US-ASCII SP, space (32)>
        // HT         = <US-ASCII HT, horizontal-tab (9)>
        // <">        = <US-ASCII double-quote mark (34)>
        // CRLF       = CR LF
        // LWS        = [CRLF] 1*( SP | HT )
        // TEXT       = <any OCTET except CTLs, but including LWS>
        // separators = "(" | ")" | "<" | ">" | "@" | "," | ";" | ":" | "\" | <">
        //              | "/" | "[" | "]" | "?" | "=" | "{" | "}" | SP | HT
        // token      = 1*<any CHAR except CTLs or separators>
        // qdtext     = <any TEXT except <">>

        for c := 0; c < 256; c++ {
                var t octetType
                isCtl := c <= 31 || c == 127
                isChar := 0 <= c && c <= 127
                isSeparator := strings.ContainsRune(" \t\"(),/:;<=>?@[]\\{}", rune(c))
                if strings.ContainsRune(" \t\r\n", rune(c)) {
                        t |= isSpace
                }
                if isChar && !isCtl && !isSeparator {
                        t |= isToken
                }
                octetTypes[c] = t
        }
}

// ParseAuthHeader extracts the challenges found in the WWW-Authenticate
// values of header. Values whose scheme is not basic, digest or bearer are
// ignored. The result is never nil and is stably sorted so that the scheme
// with the larger value comes first (bearer, then digest, then basic).
func ParseAuthHeader(header http.Header) []Challenge {
        parsed := make([]Challenge, 0)
        for _, raw := range header.Values("WWW-Authenticate") {
                name, params := parseValueAndParams(raw)

                var (
                        scheme AuthenticationScheme
                        known  = true
                )
                switch name {
                case "bearer":
                        scheme = BearerAuth
                case "digest":
                        scheme = DigestAuth
                case "basic":
                        scheme = BasicAuth
                default:
                        known = false
                }
                if !known {
                        continue
                }

                parsed = append(parsed, Challenge{Scheme: scheme, Parameters: params})
        }
        sort.Stable(byScheme(parsed))
        return parsed
}

// parseValueAndParams splits a single header value into its leading token,
// returned lower-cased, and the comma separated key=value parameters that
// follow it. Parameter keys are lower-cased; values may be tokens or quoted
// strings. Parsing stops at the first malformed parameter and whatever was
// collected so far is returned. params is never nil.
func parseValueAndParams(header string) (value string, params map[string]string) {
        params = map[string]string{}

        scheme, rest := expectToken(header)
        if scheme == "" {
                return "", params
        }
        value = strings.ToLower(scheme)

        for {
                key, afterKey := expectToken(skipSpace(rest))
                if key == "" || !strings.HasPrefix(afterKey, "=") {
                        return value, params
                }

                val, afterVal := expectTokenOrQuoted(afterKey[1:])
                params[strings.ToLower(key)] = val

                rest = skipSpace(afterVal)
                if !strings.HasPrefix(rest, ",") {
                        return value, params
                }
                rest = rest[1:]
        }
}

// skipSpace returns s without its leading bytes classified as isSpace.
func skipSpace(s string) (rest string) {
        for len(s) > 0 && octetTypes[s[0]]&isSpace != 0 {
                s = s[1:]
        }
        return s
}

// expectToken splits s into its longest prefix made only of isToken bytes
// and the remainder. token is empty when s does not start with a token byte.
func expectToken(s string) (token, rest string) {
        end := len(s)
        for pos := 0; pos < len(s); pos++ {
                if octetTypes[s[pos]]&isToken == 0 {
                        end = pos
                        break
                }
        }
        return s[:end], s[end:]
}

func expectTokenOrQuoted(s string) (value string, rest string) {
        if !strings.HasPrefix(s, "\"") {
                return expectToken(s)
        }
        s = s[1:]
        for i := 0; i < len(s); i++ {
                switch s[i] {
                case '"':
                        return s[:i], s[i+1:]
                case '\\':
                        p := make([]byte, len(s)-1)
                        j := copy(p, s[:i])
                        escape := true
                        for i = i + 1; i < len(s); i++ {
                                b := s[i]
                                switch {
                                case escape:
                                        escape = false
                                        p[j] = b
                                        j++
                                case b == '\\':
                                        escape = true
                                case b == '"':
                                        return string(p[:j]), s[i+1:]
                                default:
                                        p[j] = b
                                        j++
                                }
                        }
                        return "", ""
                }
        }
        return "", ""
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "context"
        "encoding/base64"
        "errors"
        "fmt"
        "net/http"
        "strings"
        "sync"
        "time"

        "github.com/containerd/containerd/v2/core/remotes/docker/auth"
        remoteerrors "github.com/containerd/containerd/v2/core/remotes/errors"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
)

// dockerAuthorizer authorizes registry requests using the challenges
// recorded by AddResponses, keeping one authHandler per host.
type dockerAuthorizer struct {
        // credentials returns the username and secret to use for a host
        credentials func(string) (string, string, error)

        client *http.Client
        header http.Header
        // mu guards handlers
        mu     sync.RWMutex

        // indexed by host name
        handlers map[string]*authHandler

        // onFetchRefreshToken, when set, is invoked by Authorize with any
        // refresh token obtained while authorizing a request
        onFetchRefreshToken OnFetchRefreshToken
}

// authorizerConfig collects the settings applied by AuthorizerOpt functions
// before NewDockerAuthorizer copies them into a dockerAuthorizer.
type authorizerConfig struct {
        credentials         func(string) (string, string, error)
        client              *http.Client
        header              http.Header
        onFetchRefreshToken OnFetchRefreshToken
}

// AuthorizerOpt configures an authorizer. Options are applied in the order
// they are passed to NewDockerAuthorizer, so a later option overrides an
// earlier one that sets the same field.
type AuthorizerOpt func(*authorizerConfig)

// WithAuthClient sets the HTTP client the authorizer hands to its auth
// handlers for token requests.
func WithAuthClient(client *http.Client) AuthorizerOpt {
        return func(cfg *authorizerConfig) {
                cfg.client = client
        }
}

// WithAuthCreds sets the function the authorizer calls with a host name to
// obtain the username and secret for that host.
func WithAuthCreds(creds func(string) (string, string, error)) AuthorizerOpt {
        return func(cfg *authorizerConfig) {
                cfg.credentials = creds
        }
}

// WithAuthHeader sets the HTTP headers the authorizer passes along when
// fetching tokens.
func WithAuthHeader(hdr http.Header) AuthorizerOpt {
        return func(cfg *authorizerConfig) {
                cfg.header = hdr
        }
}

// OnFetchRefreshToken is called when a refresh token has been fetched. It
// receives the refresh token and the request that was being authorized.
type OnFetchRefreshToken func(ctx context.Context, refreshToken string, req *http.Request)

// WithFetchRefreshToken enables fetching "refresh token" (aka "identity
// token", "offline token") and registers f as the callback that receives it.
func WithFetchRefreshToken(f OnFetchRefreshToken) AuthorizerOpt {
        return func(cfg *authorizerConfig) {
                cfg.onFetchRefreshToken = f
        }
}

// NewDockerAuthorizer creates an authorizer using Docker's registry
// authentication spec. Options are applied in order; when no HTTP client is
// configured http.DefaultClient is used.
// See https://docs.docker.com/registry/spec/auth/
func NewDockerAuthorizer(opts ...AuthorizerOpt) Authorizer {
        cfg := authorizerConfig{}
        for i := range opts {
                opts[i](&cfg)
        }

        httpClient := cfg.client
        if httpClient == nil {
                httpClient = http.DefaultClient
        }

        authorizer := &dockerAuthorizer{
                handlers: map[string]*authHandler{},
        }
        authorizer.credentials = cfg.credentials
        authorizer.client = httpClient
        authorizer.header = cfg.header
        authorizer.onFetchRefreshToken = cfg.onFetchRefreshToken
        return authorizer
}

// Authorize sets the Authorization header on req using the handler recorded
// for the request's host. It is a no-op when no handler exists for that
// host. When authorizing yields a refresh token and a callback was
// configured, the callback is invoked with that token and req.
func (a *dockerAuthorizer) Authorize(ctx context.Context, req *http.Request) error {
        handler := a.getAuthHandler(req.URL.Host)
        if handler == nil {
                // nothing is known about this host yet
                return nil
        }

        headerValue, refreshToken, err := handler.authorize(ctx)
        if err != nil {
                return err
        }
        req.Header.Set("Authorization", headerValue)

        if refreshToken == "" {
                return nil
        }

        a.mu.RLock()
        callback := a.onFetchRefreshToken
        a.mu.RUnlock()
        if callback != nil {
                callback(ctx, refreshToken, req)
        }
        return nil
}

// getAuthHandler returns the handler recorded for host, or nil when there is
// none. The lookup only reads the map, so it takes the shared read lock and
// concurrent Authorize calls do not serialize on each other.
func (a *dockerAuthorizer) getAuthHandler(host string) *authHandler {
        a.mu.RLock()
        defer a.mu.RUnlock()

        return a.handlers[host]
}

// AddResponses inspects the WWW-Authenticate challenges of the last response
// and records an auth handler for that response's host.
//
// Challenges are tried in the order returned by auth.ParseAuthHeader. A
// bearer challenge creates (or reuses) a bearer handler; a basic challenge is
// only used when a credentials function is configured. An empty responses
// slice yields an invalid-argument error, and a response without a supported
// challenge yields a not-implemented error.
func (a *dockerAuthorizer) AddResponses(ctx context.Context, responses []*http.Response) error {
        if len(responses) == 0 {
                // Indexing the last element below would panic.
                return fmt.Errorf("no responses to derive authorization from: %w", errdefs.ErrInvalidArgument)
        }
        last := responses[len(responses)-1]
        host := last.Request.URL.Host

        a.mu.Lock()
        defer a.mu.Unlock()
        for _, c := range auth.ParseAuthHeader(last.Header) {
                if c.Scheme == auth.BearerAuth {
                        // A server-reported error invalidates the current handler: either
                        // the request is retried with a fresh handler or it fails.
                        if retry, err := invalidAuthorization(ctx, c, responses); err != nil {
                                delete(a.handlers, host)
                                return err
                        } else if retry {
                                delete(a.handlers, host)
                        }

                        // reuse existing handler
                        //
                        // assume that one registry will return the common
                        // challenge information, including realm and service.
                        // and the resource scope is only different part
                        // which can be provided by each request.
                        if _, ok := a.handlers[host]; ok {
                                return nil
                        }

                        var username, secret string
                        if a.credentials != nil {
                                var err error
                                username, secret, err = a.credentials(host)
                                if err != nil {
                                        return err
                                }
                        }

                        common, err := auth.GenerateTokenOptions(ctx, host, username, secret, c)
                        if err != nil {
                                return err
                        }
                        common.FetchRefreshToken = a.onFetchRefreshToken != nil

                        a.handlers[host] = newAuthHandler(a.client, a.header, c.Scheme, common)
                        return nil
                } else if c.Scheme == auth.BasicAuth && a.credentials != nil {
                        username, secret, err := a.credentials(host)
                        if err != nil {
                                return err
                        }

                        if username == "" || secret == "" {
                                return fmt.Errorf("%w: no basic auth credentials", ErrInvalidAuthorization)
                        }

                        a.handlers[host] = newAuthHandler(a.client, a.header, c.Scheme, auth.TokenOptions{
                                Username: username,
                                Secret:   secret,
                        })
                        return nil
                }
        }
        return fmt.Errorf("failed to find supported auth scheme: %w", errdefs.ErrNotImplemented)
}

// authResult holds the outcome of one token fetch for a scope. The embedded
// WaitGroup lets callers asking for the same scope wait for the in-flight
// fetch instead of starting their own, which limits the request rate.
type authResult struct {
        sync.WaitGroup
        token          string
        refreshToken   string
        // expirationTime is nil when the token response carried no positive
        // lifetime; such a result is treated as never expiring
        expirationTime *time.Time
        err            error
}

// authHandler is used to handle auth request per registry server.
// The embedded Mutex guards scopedTokens.
type authHandler struct {
        sync.Mutex

        // header is passed along with token requests
        header http.Header

        // client performs the token requests
        client *http.Client

        // only support basic and bearer schemes
        scheme auth.AuthenticationScheme

        // common contains common challenge answer
        common auth.TokenOptions

        // scopedTokens caches tokens indexed by the space-joined scopes;
        // it is used in the bearer auth case
        scopedTokens map[string]*authResult
}

// newAuthHandler builds a handler for one registry host with an empty token
// cache.
func newAuthHandler(client *http.Client, hdr http.Header, scheme auth.AuthenticationScheme, opts auth.TokenOptions) *authHandler {
        handler := new(authHandler)
        handler.client = client
        handler.header = hdr
        handler.scheme = scheme
        handler.common = opts
        handler.scopedTokens = make(map[string]*authResult)
        return handler
}

// authorize produces the Authorization header value for the handler's
// scheme, plus a refresh token when one was obtained. Schemes other than
// basic and bearer yield a not-implemented error.
func (ah *authHandler) authorize(ctx context.Context) (string, string, error) {
        switch ah.scheme {
        case auth.BasicAuth:
                return ah.doBasicAuth(ctx)
        case auth.BearerAuth:
                return ah.doBearerAuth(ctx)
        default:
                // Print the scheme as a number: converting the byte value with
                // string() would put an unprintable control character in the message.
                return "", "", fmt.Errorf("failed to find supported auth scheme: %d: %w", ah.scheme, errdefs.ErrNotImplemented)
        }
}

// doBasicAuth returns the "Basic <base64(username:secret)>" header value
// built from the handler's common options. It never returns a refresh token
// and fails when either the username or the secret is empty.
func (ah *authHandler) doBasicAuth(ctx context.Context) (string, string, error) {
        user := ah.common.Username
        pass := ah.common.Secret
        if len(user) == 0 || len(pass) == 0 {
                return "", "", fmt.Errorf("failed to handle basic auth because missing username or secret")
        }

        encoded := base64.StdEncoding.EncodeToString([]byte(user + ":" + pass))
        return "Basic " + encoded, "", nil
}

// doBearerAuth returns the "Bearer <token>" Authorization header value for
// the scopes attached to ctx, together with any refresh token returned by
// the token server.
//
// Results are cached per space-joined scope string. A cached result is
// reused while it has no expiration time or has not expired yet; callers
// that arrive while a fetch for the same scope is in flight wait for that
// fetch rather than issuing their own.
//
// NOTE: a failed fetch is cached as well (its expiration time stays nil), so
// the same error is returned for that scope for as long as this handler
// is in use.
func (ah *authHandler) doBearerAuth(ctx context.Context) (token, refreshToken string, err error) {
        // copy common tokenOptions
        to := ah.common

        to.Scopes = GetTokenScopes(ctx, to.Scopes)

        // Docs: https://docs.docker.com/registry/spec/auth/scope
        scoped := strings.Join(to.Scopes, " ")

        // Keep track of the expiration time of cached bearer tokens so they can be
        // refreshed when they expire without a server roundtrip.
        var expirationTime *time.Time

        ah.Lock()
        if r, exist := ah.scopedTokens[scoped]; exist && (r.expirationTime == nil || r.expirationTime.After(time.Now())) {
                ah.Unlock()
                r.Wait()
                return r.token, r.refreshToken, r.err
        }

        // only one fetch token job
        r := new(authResult)
        r.Add(1)
        ah.scopedTokens[scoped] = r
        ah.Unlock()

        defer func() {
                token = fmt.Sprintf("Bearer %s", token)
                // Publish under the handler lock: other goroutines read
                // r.expirationTime while holding it, before they call r.Wait(),
                // so an unlocked write here would be a data race.
                ah.Lock()
                r.token, r.refreshToken, r.err, r.expirationTime = token, refreshToken, err, expirationTime
                ah.Unlock()
                r.Done()
        }()

        // fetch token for the resource scope
        if to.Secret != "" {
                // Registered after the publishing defer, so it runs first and the
                // wrapped error is what gets cached and returned.
                defer func() {
                        if err != nil {
                                err = fmt.Errorf("failed to fetch oauth token: %w", err)
                        }
                }()
                // credential information is provided, use oauth POST endpoint
                // TODO: Allow setting client_id
                resp, err := auth.FetchTokenWithOAuth(ctx, ah.client, ah.header, "containerd-client", to)
                if err != nil {
                        var errStatus remoteerrors.ErrUnexpectedStatus
                        if errors.As(err, &errStatus) {
                                // Registries without support for POST may return 404 for POST /v2/token.
                                // As of September 2017, GCR is known to return 404.
                                // As of February 2018, JFrog Artifactory is known to return 401.
                                // As of January 2022, ACR is known to return 400.
                                if (errStatus.StatusCode == 405 && to.Username != "") || errStatus.StatusCode == 404 || errStatus.StatusCode == 401 || errStatus.StatusCode == 400 {
                                        // Fall back to the GET token endpoint.
                                        resp, err := auth.FetchToken(ctx, ah.client, ah.header, to)
                                        if err != nil {
                                                return "", "", err
                                        }
                                        expirationTime = getExpirationTime(resp.ExpiresInSeconds)
                                        return resp.Token, resp.RefreshToken, nil
                                }
                                log.G(ctx).WithFields(log.Fields{
                                        "status": errStatus.Status,
                                        "body":   string(errStatus.Body),
                                }).Debugf("token request failed")
                        }
                        return "", "", err
                }
                expirationTime = getExpirationTime(resp.ExpiresInSeconds)
                return resp.AccessToken, resp.RefreshToken, nil
        }
        // do request anonymously
        resp, err := auth.FetchToken(ctx, ah.client, ah.header, to)
        if err != nil {
                return "", "", fmt.Errorf("failed to fetch anonymous token: %w", err)
        }
        expirationTime = getExpirationTime(resp.ExpiresInSeconds)
        return resp.Token, resp.RefreshToken, nil
}

// getExpirationTime converts a token lifetime in seconds into an absolute
// expiration time measured from now. It returns nil when the lifetime is
// zero or negative.
func getExpirationTime(expiresInSeconds int) *time.Time {
        if expiresInSeconds > 0 {
                deadline := time.Now().Add(time.Second * time.Duration(expiresInSeconds))
                return &deadline
        }
        return nil
}

// invalidAuthorization inspects the "error" parameter of a bearer challenge.
//
// Without an error parameter it reports no retry and no error. With one, it
// asks for a retry when there is a single response or when the last two
// responses belong to different requests; otherwise it returns the server
// message wrapped in ErrInvalidAuthorization.
func invalidAuthorization(ctx context.Context, c auth.Challenge, responses []*http.Response) (retry bool, _ error) {
        serverErr := c.Parameters["error"]
        if serverErr == "" {
                return false, nil
        }

        count := len(responses)
        // The same request failing twice in a row means a retry already happened.
        repeated := count > 1 && sameRequest(responses[count-2].Request, responses[count-1].Request)
        if count == 0 || repeated {
                return false, fmt.Errorf("server message: %s: %w", serverErr, ErrInvalidAuthorization)
        }

        // Keep the logged message short.
        const maxLoggedLen = 64
        logged := serverErr
        if len(logged) > maxLoggedLen {
                logged = logged[:maxLoggedLen] + "..."
        }
        log.G(ctx).WithField("error", logged).Debug("authorization error using bearer token, retrying")
        return true, nil
}

// sameRequest reports whether both requests use the same method and have
// equal URL values. The URLs are only compared when the methods match.
func sameRequest(r1, r2 *http.Request) bool {
        return r1.Method == r2.Method && *r1.URL == *r2.URL
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package config

import (
        "crypto/x509"
        "path/filepath"
)

// hostPaths lists the candidate configuration directories for host under
// root, in lookup order: the directory-safe form of the host name (only when
// it differs from host), the host name itself, and finally "_default".
func hostPaths(root, host string) (hosts []string) {
        if escaped := hostDirectory(host); escaped != host {
                hosts = append(hosts, filepath.Join(root, escaped))
        }
        return append(hosts, filepath.Join(root, host), filepath.Join(root, "_default"))
}

// rootSystemPool returns the certificate pool reported by
// x509.SystemCertPool.
func rootSystemPool() (*x509.CertPool, error) {
        return x509.SystemCertPool()
}

//go:build gofuzz

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package config

import (
        "os"

        fuzz "github.com/AdaLogics/go-fuzz-headers"
)

// FuzzParseHostsFile feeds fuzzer-generated files and bytes to
// parseHostsFile. It returns 1 when parseHostsFile was reached and 0 when
// the input could not be turned into a usable fixture.
func FuzzParseHostsFile(data []byte) int {
        f := fuzz.NewConsumer(data)
        dir, err := os.MkdirTemp("", "fuzz-")
        if err != nil {
                return 0
        }
        // Register the cleanup before anything else can fail so the temporary
        // directory is removed on every return path.
        defer os.RemoveAll(dir)
        err = f.CreateFiles(dir)
        if err != nil {
                return 0
        }
        b, err := f.GetBytes()
        if err != nil {
                return 0
        }
        _, _ = parseHostsFile(dir, b)
        return 1
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package config contains utilities for helping configure the Docker resolver
package config

import (
        "context"
        "crypto/tls"
        "fmt"
        "net"
        "net/http"
        "net/url"
        "os"
        "path"
        "path/filepath"
        "strings"
        "time"

        "github.com/containerd/containerd/v2/core/remotes/docker"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/pelletier/go-toml/v2"
        tomlu "github.com/pelletier/go-toml/v2/unstable"
)

// UpdateClientFunc is a function that lets you amend the behavior of the
// http.Client used by registry clients. A non-nil error aborts host
// configuration.
type UpdateClientFunc func(client *http.Client) error

// hostConfig is the intermediate description of one registry endpoint
// before ConfigureHosts turns it into a docker.RegistryHost.
type hostConfig struct {
        scheme string
        host   string
        path   string

        capabilities docker.HostCapabilities

        // caCerts are paths of PEM files added to the root CA pool
        caCerts     []string
        // clientPairs are {certificate path, key path} pairs; an empty key
        // path means the key is read from the certificate file
        clientPairs [][2]string
        // skipVerify, when non-nil, sets InsecureSkipVerify for this host
        skipVerify  *bool

        header http.Header

        // TODO: Add credential configuration (domain alias, username)
}

// HostOptions is used to configure registry hosts
type HostOptions struct {
        // HostDir returns the configuration directory for a host name; a
        // not-found error or an empty directory means defaults are used
        HostDir       func(string) (string, error)
        // Credentials returns the username and secret for a host
        Credentials   func(host string) (string, string, error)
        // DefaultTLS, when set, is the TLS configuration of the default transport
        DefaultTLS    *tls.Config
        // DefaultScheme overrides the scheme chosen for hosts without explicit
        // configuration
        DefaultScheme string
        // UpdateClient will be called after creating http.Client object, so clients can provide extra configuration
        UpdateClient   UpdateClientFunc
        // AuthorizerOpts are appended after the client and credential options
        // when the authorizer is created
        AuthorizerOpts []docker.AuthorizerOpt
}

// ConfigureHosts creates a registry hosts function from the provided
// host creation options. The host directory can read hosts.toml or
// certificate files laid out in the Docker specific layout.
// If no `HostDir` function is provided, or it yields no directory for a
// host, a single default endpoint is used for that host.
//
// The returned function builds a fresh transport, client and authorizer on
// every call. Hosts that carry their own TLS settings (CA certificates,
// client key pairs or skip-verify) get a dedicated client and authorizer;
// all other hosts share one.
func ConfigureHosts(ctx context.Context, options HostOptions) docker.RegistryHosts {
        return func(host string) ([]docker.RegistryHost, error) {
                var hosts []hostConfig
                if options.HostDir != nil {
                        dir, err := options.HostDir(host)
                        if err != nil && !errdefs.IsNotFound(err) {
                                return nil, err
                        }
                        if dir != "" {
                                log.G(ctx).WithField("dir", dir).Debug("loading host directory")
                                hosts, err = loadHostDir(ctx, dir)
                                if err != nil {
                                        return nil, err
                                }
                        }
                }

                // If hosts was not set, add a default host
                // NOTE: Check nil here and not empty, the host may be
                // intentionally configured to not have any endpoints
                if hosts == nil {
                        hosts = make([]hostConfig, 1)
                }
                // A trailing entry without a host name stands for the requested
                // host itself; fill in its scheme, host, path and capabilities.
                if len(hosts) > 0 && hosts[len(hosts)-1].host == "" {
                        if host == "docker.io" {
                                hosts[len(hosts)-1].scheme = "https"
                                hosts[len(hosts)-1].host = "registry-1.docker.io"
                        } else if docker.IsLocalhost(host) {
                                hosts[len(hosts)-1].host = host
                                if options.DefaultScheme == "" {
                                        _, port, _ := net.SplitHostPort(host)
                                        if port == "" || port == "443" {
                                                // If port is default or 443, only use https
                                                hosts[len(hosts)-1].scheme = "https"
                                        } else {
                                                // HTTP fallback logic will be used when protocol is ambiguous
                                                hosts[len(hosts)-1].scheme = "http"
                                        }

                                        // When port is 80, protocol is not ambiguous
                                        if port != "80" {
                                                // Skipping TLS verification for localhost
                                                var skipVerify = true
                                                hosts[len(hosts)-1].skipVerify = &skipVerify
                                        }
                                } else {
                                        hosts[len(hosts)-1].scheme = options.DefaultScheme
                                }
                        } else {
                                hosts[len(hosts)-1].host = host
                                if options.DefaultScheme != "" {
                                        hosts[len(hosts)-1].scheme = options.DefaultScheme
                                } else {
                                        hosts[len(hosts)-1].scheme = "https"
                                }
                        }
                        hosts[len(hosts)-1].path = "/v2"
                        hosts[len(hosts)-1].capabilities = docker.HostCapabilityPull | docker.HostCapabilityResolve | docker.HostCapabilityPush
                }

                // tlsConfigured indicates that TLS was configured and HTTP endpoints should
                // attempt to use the TLS configuration before falling back to HTTP
                var tlsConfigured bool

                var defaultTLSConfig *tls.Config
                if options.DefaultTLS != nil {
                        tlsConfigured = true
                        defaultTLSConfig = options.DefaultTLS
                } else {
                        defaultTLSConfig = &tls.Config{}
                }

                defaultTransport := &http.Transport{
                        Proxy: http.ProxyFromEnvironment,
                        DialContext: (&net.Dialer{
                                Timeout:       30 * time.Second,
                                KeepAlive:     30 * time.Second,
                                FallbackDelay: 300 * time.Millisecond,
                        }).DialContext,
                        MaxIdleConns:          10,
                        IdleConnTimeout:       30 * time.Second,
                        TLSHandshakeTimeout:   10 * time.Second,
                        TLSClientConfig:       defaultTLSConfig,
                        ExpectContinueTimeout: 5 * time.Second,
                }

                // Shared client, used by every host without its own TLS settings.
                client := &http.Client{
                        Transport: defaultTransport,
                }
                if options.UpdateClient != nil {
                        if err := options.UpdateClient(client); err != nil {
                                return nil, err
                        }
                }

                // Caller-supplied authorizer options come last so they can override
                // the client and credentials set here.
                authOpts := []docker.AuthorizerOpt{docker.WithAuthClient(client)}
                if options.Credentials != nil {
                        authOpts = append(authOpts, docker.WithAuthCreds(options.Credentials))
                }
                authOpts = append(authOpts, options.AuthorizerOpts...)
                authorizer := docker.NewDockerAuthorizer(authOpts...)

                rhosts := make([]docker.RegistryHost, len(hosts))
                for i, host := range hosts {
                        // Allow setting for each host as well
                        explicitTLS := tlsConfigured

                        if host.caCerts != nil || host.clientPairs != nil || host.skipVerify != nil {
                                explicitTLS = true
                                // Work on a clone so per-host TLS changes stay off the default transport.
                                tr := defaultTransport.Clone()
                                tlsConfig := tr.TLSClientConfig
                                if host.skipVerify != nil {
                                        tlsConfig.InsecureSkipVerify = *host.skipVerify
                                }
                                if host.caCerts != nil {
                                        if tlsConfig.RootCAs == nil {
                                                rootPool, err := rootSystemPool()
                                                if err != nil {
                                                        return nil, fmt.Errorf("unable to initialize cert pool: %w", err)
                                                }
                                                tlsConfig.RootCAs = rootPool
                                        }
                                        for _, f := range host.caCerts {
                                                data, err := os.ReadFile(f)
                                                if err != nil {
                                                        return nil, fmt.Errorf("unable to read CA cert %q: %w", f, err)
                                                }
                                                if !tlsConfig.RootCAs.AppendCertsFromPEM(data) {
                                                        return nil, fmt.Errorf("unable to load CA cert %q", f)
                                                }
                                        }
                                }

                                for _, pair := range host.clientPairs {
                                        certPEMBlock, err := os.ReadFile(pair[0])
                                        if err != nil {
                                                return nil, fmt.Errorf("unable to read CERT file %q: %w", pair[0], err)
                                        }
                                        var keyPEMBlock []byte
                                        if pair[1] != "" {
                                                keyPEMBlock, err = os.ReadFile(pair[1])
                                                if err != nil {
                                                        // NOTE(review): the message says CERT although this path is the key file.
                                                        return nil, fmt.Errorf("unable to read CERT file %q: %w", pair[1], err)
                                                }
                                        } else {
                                                // Load key block from same PEM file
                                                keyPEMBlock = certPEMBlock
                                        }
                                        cert, err := tls.X509KeyPair(certPEMBlock, keyPEMBlock)
                                        if err != nil {
                                                return nil, fmt.Errorf("failed to load X509 key pair: %w", err)
                                        }

                                        tlsConfig.Certificates = append(tlsConfig.Certificates, cert)
                                }

                                // Shallow-copy the shared client and give the copy the per-host transport.
                                c := *client
                                c.Transport = tr
                                if options.UpdateClient != nil {
                                        if err := options.UpdateClient(&c); err != nil {
                                                return nil, err
                                        }
                                }

                                rhosts[i].Client = &c
                                // NOTE(review): append may write into authOpts' spare capacity on every
                                // iteration; confirm the options are consumed before the next iteration
                                // if this is ever refactored.
                                rhosts[i].Authorizer = docker.NewDockerAuthorizer(append(authOpts, docker.WithAuthClient(&c))...)
                        } else {
                                rhosts[i].Client = client
                                rhosts[i].Authorizer = authorizer
                        }

                        // When TLS has been configured for the operation or host and
                        // the protocol from the port number is ambiguous, use the
                        // docker.NewHTTPFallback roundtripper to catch TLS errors and re-attempt the
                        // request as http. This allows preference for https when configured but
                        // also catches TLS errors early enough in the request to avoid sending
                        // the request twice or consuming the request body.
                        if host.scheme == "http" && explicitTLS {
                                _, port, _ := net.SplitHostPort(host.host)
                                if port != "" && port != "80" {
                                        log.G(ctx).WithField("host", host.host).Info("host will try HTTPS first since it is configured for HTTP with a TLS configuration, consider changing host to HTTPS or removing unused TLS configuration")
                                        // host is the loop's copy; the new scheme is written to rhosts[i] below.
                                        host.scheme = "https"
                                        // NOTE(review): when this host uses the shared client, the wrapped
                                        // transport is also seen by every other host and by the shared
                                        // authorizer using that client; confirm this is intended.
                                        rhosts[i].Client.Transport = docker.NewHTTPFallback(rhosts[i].Client.Transport)
                                }
                        }

                        rhosts[i].Scheme = host.scheme
                        rhosts[i].Host = host.host
                        rhosts[i].Path = host.path
                        rhosts[i].Capabilities = host.capabilities
                        rhosts[i].Header = host.header
                }

                return rhosts, nil
        }

}

// HostDirFromRoot builds a lookup function for host configuration
// directories located under root.
//
// The returned function walks the candidate paths produced by hostPaths for
// the given host and returns the first one that exists. A stat failure other
// than "does not exist" is returned as-is; when no candidate exists the
// function returns errdefs.ErrNotFound.
func HostDirFromRoot(root string) func(string) (string, error) {
        return func(host string) (string, error) {
                for _, candidate := range hostPaths(root, host) {
                        _, err := os.Stat(candidate)
                        switch {
                        case err == nil:
                                return candidate, nil
                        case !os.IsNotExist(err):
                                return "", err
                        }
                        // Candidate is missing, try the next one.
                }
                return "", errdefs.ErrNotFound
        }
}

// hostDirectory rewrites the trailing ":port" of host as "_port_" so the
// result can be used as a directory name. The split happens at the last
// colon; a host without a colon, or whose only colon is the first character,
// is returned unchanged.
func hostDirectory(host string) string {
        sep := strings.LastIndexByte(host, ':')
        if sep <= 0 {
                return host
        }
        name, port := host[:sep], host[sep+1:]
        return name + "_" + port + "_"
}

// loadHostDir loads the host configurations stored in hostsDir.
//
// It reads "hosts.toml" from the directory. When the file is missing or
// empty, or when it cannot be decoded (the decode error is logged), the
// function falls back to loadCertFiles, which looks for certificate files
// following Docker's ".crt"/".cert"/".key" layout. Read errors other than
// "does not exist" are returned to the caller.
func loadHostDir(ctx context.Context, hostsDir string) ([]hostConfig, error) {
        hostsFile := filepath.Join(hostsDir, "hosts.toml")

        data, err := os.ReadFile(hostsFile)
        if err != nil && !os.IsNotExist(err) {
                return nil, err
        }
        if len(data) == 0 {
                // No usable hosts.toml, rely on certificate files only.
                return loadCertFiles(ctx, hostsDir)
        }

        hosts, err := parseHostsFile(hostsDir, data)
        if err == nil {
                return hosts, nil
        }

        // A broken hosts.toml is not fatal: report it and use the
        // certificate files instead.
        log.G(ctx).WithError(err).Errorf("failed to decode %s", hostsFile)
        return loadCertFiles(ctx, hostsDir)
}

// hostFileConfig is the TOML representation of the settings for one host.
// parseHostsFile decodes it both for the top-level (server) section and for
// each entry of the "host" table, and parseHostConfig turns it into a
// hostConfig.
type hostFileConfig struct {
        // Capabilities determine what operations a host is
        // capable of performing. Allowed values
        //  - pull
        //  - resolve
        //  - push
        Capabilities []string `toml:"capabilities"`

        // CACert are the public key certificates for TLS
        // Accepted types
        // - string - Single file with certificate(s)
        // - []string - Multiple files with certificates
        CACert interface{} `toml:"ca"`

        // Client keypair(s) for TLS with client authentication
        // Accepted types
        // - string - Single file with public and private keys
        // - []string - Multiple files with public and private keys
        // - [][2]string - Multiple keypairs with public and private keys in separate files
        Client interface{} `toml:"client"`

        // SkipVerify skips verification of the server's certificate chain
        // and host name. This should only be used for testing or in
        // combination with other methods of verifying connections.
        SkipVerify *bool `toml:"skip_verify"`

        // Header are additional header fields to send to the server.
        // Each value is either a single string or a list of strings.
        Header map[string]interface{} `toml:"header"`

        // OverridePath indicates the API root endpoint is defined in the URL
        // path rather than by the API specification.
        // This may be used with non-compliant OCI registries to override the
        // API root endpoint.
        OverridePath bool `toml:"override_path"`

        // TODO: Credentials: helper? name? username? alternate domain? token?
}

// parseHostsFile decodes the content of a hosts.toml file.
//
// baseDir is used to resolve relative file paths found in the file and b is
// the raw TOML document. The result lists the entries of the "host" table in
// the order they appear in the file, followed by one final entry built from
// the top-level settings and the "server" key.
func parseHostsFile(baseDir string, b []byte) ([]hostConfig, error) {
        hostOrder, err := getSortedHosts(b)
        if err != nil {
                return nil, err
        }

        var file struct {
                hostFileConfig
                // Server specifies the default server. When `host` is
                // also specified, those hosts are tried first.
                Server string `toml:"server"`
                // HostConfigs store the per-host configuration
                HostConfigs map[string]hostFileConfig `toml:"host"`
        }
        if err := toml.Unmarshal(b, &file); err != nil {
                return nil, err
        }

        // One slot per listed host plus one for the root configuration.
        hosts := make([]hostConfig, 0, len(hostOrder)+1)
        for _, name := range hostOrder {
                entry, err := parseHostConfig(name, baseDir, file.HostConfigs[name])
                if err != nil {
                        return nil, err
                }
                hosts = append(hosts, entry)
        }

        // The root host config always goes last.
        root, err := parseHostConfig(file.Server, baseDir, file.hostFileConfig)
        if err != nil {
                return nil, err
        }
        return append(hosts, root), nil
}

// parseHostConfig converts one decoded hosts.toml section into a hostConfig.
//
// server is the address of the host; it may be empty, in which case scheme,
// host and path are left unset. A server without an explicit "http://" or
// "https://" scheme is treated as https. Unless config.OverridePath is set,
// the resulting path is made to end in "/v2".
//
// baseDir is used to turn relative certificate and key paths into absolute
// ones.
//
// An error is returned when the server cannot be parsed as a URL, when an
// unknown capability is listed, or when "ca", "client" or a header value has
// an unsupported type.
func parseHostConfig(server string, baseDir string, config hostFileConfig) (hostConfig, error) {
        var (
                result = hostConfig{}
                err    error
        )

        if server != "" {
                // Only an explicit http(s) scheme marks the value as a full URL.
                // A bare host such as "httpbin.example.com:5000" also starts
                // with "http" and must still receive the https default,
                // otherwise url.Parse would not yield a usable host.
                if !strings.HasPrefix(server, "http://") && !strings.HasPrefix(server, "https://") {
                        server = "https://" + server
                }
                u, err := url.Parse(server)
                if err != nil {
                        return hostConfig{}, fmt.Errorf("unable to parse server %v: %w", server, err)
                }
                result.scheme = u.Scheme
                result.host = u.Host
                if len(u.Path) > 0 {
                        u.Path = path.Clean(u.Path)
                        if !strings.HasSuffix(u.Path, "/v2") && !config.OverridePath {
                                u.Path = u.Path + "/v2"
                        }
                } else if !config.OverridePath {
                        u.Path = "/v2"
                }
                result.path = u.Path
        }

        result.skipVerify = config.SkipVerify

        if len(config.Capabilities) > 0 {
                for _, c := range config.Capabilities {
                        switch strings.ToLower(c) {
                        case "pull":
                                result.capabilities |= docker.HostCapabilityPull
                        case "resolve":
                                result.capabilities |= docker.HostCapabilityResolve
                        case "push":
                                result.capabilities |= docker.HostCapabilityPush
                        default:
                                return hostConfig{}, fmt.Errorf("unknown capability %v", c)
                        }
                }
        } else {
                // No explicit list means the host may be used for everything.
                result.capabilities = docker.HostCapabilityPull | docker.HostCapabilityResolve | docker.HostCapabilityPush
        }

        if config.CACert != nil {
                switch cert := config.CACert.(type) {
                case string:
                        result.caCerts = []string{makeAbsPath(cert, baseDir)}
                case []interface{}:
                        result.caCerts, err = makeStringSlice(cert, func(p string) string {
                                return makeAbsPath(p, baseDir)
                        })
                        if err != nil {
                                return hostConfig{}, err
                        }
                default:
                        // Report the offending type, matching the nested
                        // "client" error below.
                        return hostConfig{}, fmt.Errorf("invalid type %T for \"ca\"", cert)
                }
        }

        if config.Client != nil {
                switch client := config.Client.(type) {
                case string:
                        // A single file holding both the certificate and the key.
                        result.clientPairs = [][2]string{{makeAbsPath(client, baseDir), ""}}
                case []interface{}:
                        // []string or [][2]string
                        for _, pairs := range client {
                                switch p := pairs.(type) {
                                case string:
                                        result.clientPairs = append(result.clientPairs, [2]string{makeAbsPath(p, baseDir), ""})
                                case []interface{}:
                                        slice, err := makeStringSlice(p, func(s string) string {
                                                return makeAbsPath(s, baseDir)
                                        })
                                        if err != nil {
                                                return hostConfig{}, err
                                        }
                                        if len(slice) != 2 {
                                                return hostConfig{}, fmt.Errorf("invalid pair %v for \"client\"", p)
                                        }

                                        var pair [2]string
                                        copy(pair[:], slice)
                                        result.clientPairs = append(result.clientPairs, pair)
                                default:
                                        return hostConfig{}, fmt.Errorf("invalid type %T for \"client\"", p)
                                }
                        }
                default:
                        return hostConfig{}, fmt.Errorf("invalid type %T for \"client\"", client)
                }
        }

        if config.Header != nil {
                header := http.Header{}
                for key, ty := range config.Header {
                        switch value := ty.(type) {
                        case string:
                                header[key] = []string{value}
                        case []interface{}:
                                header[key], err = makeStringSlice(value, nil)
                                if err != nil {
                                        return hostConfig{}, err
                                }
                        default:
                                return hostConfig{}, fmt.Errorf("invalid type %T for header %q", ty, key)
                        }
                }
                result.header = header
        }

        return result, nil
}

// getSortedHosts returns the host names of the `host` table in the order in
// which they are first defined in the TOML document b.
//
// Every host is listed once, even when its sub-tables (for example
// `host.XXX.header`) are separated from the host's own table by other hosts.
// Tables with an empty host name are ignored. A syntax error reported by the
// parser is returned to the caller.
func getSortedHosts(b []byte) ([]string, error) {
        var hostsInOrder []string

        // Use toml unstable package for directly parsing toml
        // See https://github.com/pelletier/go-toml/discussions/801#discussioncomment-7083586
        p := tomlu.Parser{}
        p.Reset(b)

        // seen records the hosts already listed so that a host is not
        // reported again when one of its sub-tables shows up later on.
        seen := make(map[string]struct{})

        // iterate over all top level expressions
        for p.NextExpression() {
                e := p.Expression()

                if e.Kind != tomlu.Table {
                        continue
                }

                // Let's look at the key. It's an iterator over the multiple dotted parts of the key.
                var parts []string
                for it := e.Key(); it.Next(); {
                        parts = append(parts, string(it.Node().Data))
                }

                // only consider keys that look like `host.XXX`
                // (this also matches subtables such as `host.XXX.header`,
                // which are filtered out by the seen check below)
                if len(parts) < 2 || parts[0] != "host" || parts[1] == "" {
                        continue
                }

                name := parts[1]
                if _, ok := seen[name]; ok {
                        continue
                }
                seen[name] = struct{}{}
                hostsInOrder = append(hostsInOrder, name)
        }

        // NextExpression also returns false on a parse failure.
        if err := p.Error(); err != nil {
                return nil, err
        }

        return hostsInOrder, nil
}

// makeStringSlice converts a []interface{} whose elements are all strings
// into a []string. When cb is non-nil it is applied to every element and its
// result is stored instead. An error is returned for the first element that
// is not a string.
func makeStringSlice(slice []interface{}, cb func(string) string) ([]string, error) {
        result := make([]string, 0, len(slice))
        for _, elem := range slice {
                s, isString := elem.(string)
                if !isString {
                        return nil, fmt.Errorf("unable to cast %v to string", elem)
                }
                if cb != nil {
                        s = cb(s)
                }
                result = append(result, s)
        }
        return result, nil
}

// makeAbsPath returns p unchanged when it is already absolute; otherwise it
// returns p joined onto base.
func makeAbsPath(p string, base string) string {
        if !filepath.IsAbs(p) {
                p = filepath.Join(base, p)
        }
        return p
}

// loadCertFiles builds a single host configuration from the certificate
// files found directly inside certsDir (for example a host directory below
// "/etc/docker/certs.d"). It follows the Docker file layout:
//   - files ending with ".crt" are treated as CA certificate files
//   - files ending with ".cert" are treated as client certificates, and
//     files with the same name but ending with ".key" are treated as the
//     corresponding private key.
//     NOTE: If a ".key" file is missing, only the ".cert" path is recorded,
//     since that file may itself contain the private key. If it does not,
//     the caller should detect that and error.
//
// Sub-directories are skipped. A missing certsDir is not an error; the
// returned slice then holds one empty host configuration.
func loadCertFiles(ctx context.Context, certsDir string) ([]hostConfig, error) {
        entries, err := os.ReadDir(certsDir)
        if err != nil && !os.IsNotExist(err) {
                return nil, err
        }

        var cfg hostConfig
        for _, entry := range entries {
                if entry.IsDir() {
                        continue
                }

                name := entry.Name()
                switch {
                case strings.HasSuffix(name, ".crt"):
                        cfg.caCerts = append(cfg.caCerts, filepath.Join(certsDir, name))
                case strings.HasSuffix(name, ".cert"):
                        pair := [2]string{filepath.Join(certsDir, name), ""}

                        // Look for the matching private key next to the certificate.
                        keyPath := filepath.Join(certsDir, strings.TrimSuffix(name, ".cert")+".key")
                        _, statErr := os.Stat(keyPath)
                        if statErr == nil {
                                pair[1] = keyPath
                        } else if !os.IsNotExist(statErr) {
                                return nil, statErr
                        }
                        cfg.clientPairs = append(cfg.clientPairs, pair)
                }
        }
        return []hostConfig{cfg}, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "bytes"
        "context"
        "encoding/json"
        "fmt"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/log"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// LegacyConfigMediaType is the generic config media type that
// ConvertManifest replaces with the Docker schema2 config media type.
// It should be replaced by the OCI image spec.
//
// More detail: docker/distribution#1622
const LegacyConfigMediaType = "application/octet-stream"

// ConvertManifest rewrites a manifest whose config media type is
// application/octet-stream so that it uses the schema2 config media type,
// when needed.
//
// Descriptors that are not manifests, and manifests whose config already has
// another media type, are returned untouched. Otherwise the updated manifest
// is written to store with gc reference labels for the config and every
// layer, and the descriptor of the new manifest is returned.
//
// NOTE:
// 1. original manifest will be deleted by next gc round.
// 2. manifest lists are not covered.
func ConvertManifest(ctx context.Context, store content.Store, desc ocispec.Descriptor) (ocispec.Descriptor, error) {
        if !images.IsManifestType(desc.MediaType) {
                log.G(ctx).Warnf("do nothing for media type: %s", desc.MediaType)
                return desc, nil
        }

        // Load and decode the current manifest.
        raw, err := content.ReadBlob(ctx, store, desc)
        if err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("failed to read index data: %w", err)
        }

        var manifest ocispec.Manifest
        if err := json.Unmarshal(raw, &manifest); err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("failed to unmarshal data into manifest: %w", err)
        }

        // Nothing to convert unless the config uses the legacy media type.
        if manifest.Config.MediaType != LegacyConfigMediaType {
                return desc, nil
        }

        manifest.Config.MediaType = images.MediaTypeDockerSchema2Config
        updated, err := json.MarshalIndent(manifest, "", "   ")
        if err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("failed to marshal manifest: %w", err)
        }

        // The descriptor must describe the re-encoded manifest.
        desc.Digest = digest.Canonical.FromBytes(updated)
        desc.Size = int64(len(updated))

        // gc labels: index 0 is the config, the layers follow in order.
        labels := make(map[string]string, len(manifest.Layers)+1)
        labels[fmt.Sprintf("containerd.io/gc.ref.content.%d", 0)] = manifest.Config.Digest.String()
        for i, layer := range manifest.Layers {
                labels[fmt.Sprintf("containerd.io/gc.ref.content.%d", i+1)] = layer.Digest.String()
        }

        ref := remotes.MakeRefKey(ctx, desc)
        err = content.WriteBlob(ctx, store, ref, bytes.NewReader(updated), desc, content.WithLabels(labels))
        if err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("failed to update content: %w", err)
        }
        return desc, nil
}

//go:build gofuzz

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "context"
        "os"

        fuzz "github.com/AdaLogics/go-fuzz-headers"
        "github.com/containerd/containerd/v2/plugins/content/local"
        "github.com/containerd/log"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// FuzzConvertManifest is a go-fuzz entry point for ConvertManifest.
//
// It derives an ocispec.Descriptor from data, creates a local content store
// in a fresh temporary directory and runs ConvertManifest against it. The
// function returns 0 when the input could not be turned into a test case and
// 1 once ConvertManifest has been exercised. The temporary directory is
// removed before returning so repeated fuzz iterations do not pile up
// directories on disk.
func FuzzConvertManifest(data []byte) int {
        ctx := context.Background()

        // Do not log the message below
        // level=warning msg="do nothing for media type: ..."
        log.G(ctx).Logger.SetLevel(log.PanicLevel)

        f := fuzz.NewConsumer(data)
        desc := ocispec.Descriptor{}
        err := f.GenerateStruct(&desc)
        if err != nil {
                return 0
        }
        tmpdir, err := os.MkdirTemp("", "fuzzing-")
        if err != nil {
                return 0
        }
        // Best-effort cleanup; a failure to remove the directory must not
        // affect the fuzzing result.
        defer os.RemoveAll(tmpdir)

        cs, err := local.NewStore(tmpdir)
        if err != nil {
                return 0
        }
        _, _ = ConvertManifest(ctx, cs, desc)
        return 1
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "encoding/json"
        "fmt"
        "strings"
)

// ErrorCoder is the base interface for ErrorCode and Error allowing
// users of each to just call ErrorCode to get the real ID of each
type ErrorCoder interface {
        // ErrorCode returns the ErrorCode associated with the value.
        ErrorCode() ErrorCode
}

// ErrorCode represents the error type. The errors are serialized via strings
// and the integer format may change and should *never* be exported.
type ErrorCode int

// Compile-time check that ErrorCode implements error.
var _ error = ErrorCode(0)

// ErrorCode just returns itself, which makes ErrorCode satisfy the
// ErrorCoder interface.
func (ec ErrorCode) ErrorCode() ErrorCode {
        return ec
}

// Error returns the identifier of the error code in lower case with
// underscores replaced by spaces.
func (ec ErrorCode) Error() string {
        // NOTE(stevvooe): Cannot use message here since it may have unpopulated args.
        id := ec.String()
        spaced := strings.ReplaceAll(id, "_", " ")
        return strings.ToLower(spaced)
}

// Descriptor returns the registered descriptor for the error code. Codes
// that have not been registered resolve to the descriptor of
// ErrorCodeUnknown.
func (ec ErrorCode) Descriptor() ErrorDescriptor {
        if d, found := errorCodeToDescriptors[ec]; found {
                return d
        }
        return ErrorCodeUnknown.Descriptor()
}

// String returns the canonical identifier for this error code, i.e. the
// Value of its descriptor.
func (ec ErrorCode) String() string {
        d := ec.Descriptor()
        return d.Value
}

// Message returns the human-readable error message for this error code,
// i.e. the Message of its descriptor.
func (ec ErrorCode) Message() string {
        d := ec.Descriptor()
        return d.Message
}

// MarshalText encodes the receiver as text: the bytes of its canonical
// identifier. It never returns an error.
func (ec ErrorCode) MarshalText() (text []byte, err error) {
        id := ec.String()
        return []byte(id), nil
}

// UnmarshalText decodes the form generated by MarshalText. Identifiers that
// are not registered decode to the code of ErrorCodeUnknown; the method
// always returns nil.
func (ec *ErrorCode) UnmarshalText(text []byte) error {
        if desc, known := idToDescriptors[string(text)]; known {
                *ec = desc.Code
        } else {
                *ec = ErrorCodeUnknown.Descriptor().Code
        }
        return nil
}

// WithMessage creates a new Error for this code whose Message is the given
// message instead of the code's registered one.
func (ec ErrorCode) WithMessage(message string) Error {
        var e Error
        e.Code = ec
        e.Message = message
        return e
}

// WithDetail creates a new Error for this code, carrying the code's
// registered message and the given detail.
func (ec ErrorCode) WithDetail(detail interface{}) Error {
        base := Error{
                Code:    ec,
                Message: ec.Message(),
        }
        return base.WithDetail(detail)
}

// WithArgs creates a new Error for this code and formats the code's
// registered message with args.
func (ec ErrorCode) WithArgs(args ...interface{}) Error {
        base := Error{
                Code:    ec,
                Message: ec.Message(),
        }
        return base.WithArgs(args...)
}

// Error provides a wrapper around ErrorCode with extra Details provided.
type Error struct {
        // Code identifies the kind of error.
        Code ErrorCode `json:"code"`
        // Message is the human readable text for this occurrence.
        Message string `json:"message"`
        // Detail holds optional extra data; it is left out of the JSON
        // encoding when empty.
        Detail interface{} `json:"detail,omitempty"`

        // TODO(duglin): See if we need an "args" property so we can do the
        // variable substitution right before showing the message to the user
}

// Compile-time check that Error implements error.
var _ error = Error{}

// ErrorCode returns the ID/Value of this Error, i.e. its Code field, which
// makes Error satisfy the ErrorCoder interface.
func (e Error) ErrorCode() ErrorCode {
        return e.Code
}

// Error returns a human readable representation of the error in the form
// "<code text>: <message>".
func (e Error) Error() string {
        codeText := e.Code.Error()
        return fmt.Sprintf("%s: %s", codeText, e.Message)
}

// WithDetail returns a copy of the Error with Detail replaced by detail.
// The receiver is not modified.
func (e Error) WithDetail(detail interface{}) Error {
        // e is already a copy because of the value receiver.
        e.Detail = detail
        return e
}

// WithArgs returns a copy of the Error whose Message is the registered
// message of the Error's code formatted with args. The receiver is not
// modified.
func (e Error) WithArgs(args ...interface{}) Error {
        // e is already a copy because of the value receiver.
        format := e.Code.Message()
        e.Message = fmt.Sprintf(format, args...)
        return e
}

// ErrorDescriptor provides relevant information about a given error code.
type ErrorDescriptor struct {
        // Code is the error code that this descriptor describes.
        Code ErrorCode

        // Value provides a unique, string key, often capitalized with
        // underscores, to identify the error code. This value is used as the
        // keyed value when serializing api errors.
        Value string

        // Message is a short, human readable description of the error condition
        // included in API responses.
        Message string

        // Description provides a complete account of the error's purpose,
        // suitable for use in documentation.
        Description string

        // HTTPStatusCode provides the http status code that is associated with
        // this error condition.
        HTTPStatusCode int
}

// ParseErrorCode looks up the error code registered under the string
// identifier value. `ErrorCodeUnknown` is returned when the identifier is
// not known.
func ParseErrorCode(value string) ErrorCode {
        ed, known := idToDescriptors[value]
        if !known {
                return ErrorCodeUnknown
        }
        return ed.Code
}

// Errors provides the envelope for multiple errors and a few sugar methods
// for use within the application.
type Errors []error

// Compile-time check that Errors implements error.
var _ error = Errors{}

// Error renders the collection as text: "<nil>" when it is empty, the
// message of the only element when there is exactly one, and otherwise an
// "errors:" header followed by one message per line.
func (errs Errors) Error() string {
        if len(errs) == 0 {
                return "<nil>"
        }
        if len(errs) == 1 {
                return errs[0].Error()
        }

        var b strings.Builder
        b.WriteString("errors:\n")
        for _, err := range errs {
                b.WriteString(err.Error())
                b.WriteByte('\n')
        }
        return b.String()
}

// Len returns the current number of errors held in the collection.
func (errs Errors) Len() int {
        return len(errs)
}

// MarshalJSON converts the slice of error, ErrorCode or Error values into a
// slice of Error and serializes it under the "errors" key.
//
// An ErrorCode becomes an Error with the code's message, an Error is used
// as-is, and any other error is wrapped as the Detail of an
// ErrorCodeUnknown Error.
func (errs Errors) MarshalJSON() ([]byte, error) {
        var envelope struct {
                Errors []Error `json:"errors,omitempty"`
        }

        for _, item := range errs {
                var converted Error

                switch v := item.(type) {
                case ErrorCode:
                        converted = v.WithDetail(nil)
                case Error:
                        converted = v
                default:
                        converted = ErrorCodeUnknown.WithDetail(item)
                }

                // If the Error struct was set up without a Message (meaning
                // it is "") then take the message from the error code.
                if converted.Message == "" {
                        converted.Message = converted.Code.Message()
                }

                envelope.Errors = append(envelope.Errors, converted)
        }

        return json.Marshal(envelope)
}

// UnmarshalJSON deserializes a JSON envelope of Error values and stores the
// result in errs as a slice of Error or ErrorCode values.
//
// An entry without detail whose message is empty or equal to the registered
// message of its code is reduced to the bare ErrorCode; every other entry is
// kept as a full Error.
func (errs *Errors) UnmarshalJSON(data []byte) error {
        var envelope struct {
                Errors []Error
        }
        if err := json.Unmarshal(data, &envelope); err != nil {
                return err
        }

        var decoded Errors
        for _, item := range envelope.Errors {
                // The entry carries nothing beyond what the code itself says.
                codeOnly := item.Detail == nil &&
                        (item.Message == "" || item.Message == item.Code.Message())
                if codeOnly {
                        decoded = append(decoded, item.Code)
                        continue
                }

                // Entries with details or a custom message are kept untouched.
                decoded = append(decoded, item)
        }

        *errs = decoded
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "fmt"
        "net/http"
        "sort"
        "sync"
)

// Lookup tables of the registered error descriptors. They are filled by
// Register.
var (
        // errorCodeToDescriptors maps an error code to its descriptor.
        errorCodeToDescriptors = map[ErrorCode]ErrorDescriptor{}
        // idToDescriptors maps a descriptor's Value to the descriptor.
        idToDescriptors = map[string]ErrorDescriptor{}
        // groupToDescriptors maps a group name to the descriptors
        // registered under it.
        groupToDescriptors = map[string][]ErrorDescriptor{}
)

// Generic error codes, all registered under the "errcode" group.
var (
        // ErrorCodeUnknown is a generic error that can be used as a last
        // resort if there is no situation-specific error message that can be used
        ErrorCodeUnknown = Register("errcode", ErrorDescriptor{
                Value:   "UNKNOWN",
                Message: "unknown error",
                Description: `Generic error returned when the error does not have an
                                                                    API classification.`,
                HTTPStatusCode: http.StatusInternalServerError,
        })

        // ErrorCodeUnsupported is returned when an operation is not supported.
        ErrorCodeUnsupported = Register("errcode", ErrorDescriptor{
                Value:   "UNSUPPORTED",
                Message: "The operation is unsupported.",
                Description: `The operation was unsupported due to a missing
                implementation or invalid set of parameters.`,
                HTTPStatusCode: http.StatusMethodNotAllowed,
        })

        // ErrorCodeUnauthorized is returned if a request requires
        // authentication.
        ErrorCodeUnauthorized = Register("errcode", ErrorDescriptor{
                Value:   "UNAUTHORIZED",
                Message: "authentication required",
                Description: `The access controller was unable to authenticate
                the client. Often this will be accompanied by a
                Www-Authenticate HTTP response header indicating how to
                authenticate.`,
                HTTPStatusCode: http.StatusUnauthorized,
        })

        // ErrorCodeDenied is returned if a client does not have sufficient
        // permission to perform an action.
        ErrorCodeDenied = Register("errcode", ErrorDescriptor{
                Value:   "DENIED",
                Message: "requested access to the resource is denied",
                Description: `The access controller denied access for the
                operation on a resource.`,
                HTTPStatusCode: http.StatusForbidden,
        })

        // ErrorCodeUnavailable provides a common error to report unavailability
        // of a service or endpoint.
        ErrorCodeUnavailable = Register("errcode", ErrorDescriptor{
                Value:          "UNAVAILABLE",
                Message:        "service unavailable",
                Description:    "Returned when a service is not available",
                HTTPStatusCode: http.StatusServiceUnavailable,
        })

        // ErrorCodeTooManyRequests is returned if a client attempts too many
        // times to contact a service endpoint.
        ErrorCodeTooManyRequests = Register("errcode", ErrorDescriptor{
                Value:   "TOOMANYREQUESTS",
                Message: "too many requests",
                Description: `Returned when a client attempts to contact a
                service too many times`,
                HTTPStatusCode: http.StatusTooManyRequests,
        })
)

// nextCode is the numeric value Register assigns to the next descriptor; it
// is incremented after every successful registration.
var nextCode = 1000

// registerLock is held by Register for the duration of a registration.
var registerLock sync.Mutex

// Register makes the passed-in error descriptor known to the environment
// under the given group and returns the ErrorCode newly assigned to it.
//
// It panics when the descriptor's Value, or the code about to be assigned,
// is already registered.
func Register(group string, descriptor ErrorDescriptor) ErrorCode {
        registerLock.Lock()
        defer registerLock.Unlock()

        code := ErrorCode(nextCode)
        descriptor.Code = code

        if _, taken := idToDescriptors[descriptor.Value]; taken {
                panic(fmt.Sprintf("ErrorValue %q is already registered", descriptor.Value))
        }
        if _, taken := errorCodeToDescriptors[code]; taken {
                panic(fmt.Sprintf("ErrorCode %v is already registered", descriptor.Code))
        }

        // Record the descriptor in all three lookup tables.
        groupToDescriptors[group] = append(groupToDescriptors[group], descriptor)
        errorCodeToDescriptors[code] = descriptor
        idToDescriptors[descriptor.Value] = descriptor

        // The code is consumed only once the registration went through.
        nextCode++
        return code
}

// byValue implements sort.Interface, ordering descriptors by their Value
// string in ascending order.
type byValue []ErrorDescriptor

// Len reports the number of descriptors.
func (d byValue) Len() int {
        return len(d)
}

// Swap exchanges the descriptors at positions i and j.
func (d byValue) Swap(i, j int) {
        d[j], d[i] = d[i], d[j]
}

// Less reports whether the Value at i sorts before the Value at j.
func (d byValue) Less(i, j int) bool {
        return d[j].Value > d[i].Value
}

// GetGroupNames returns the names of all registered error groups in
// ascending order. The result is never nil, even when no group is registered.
func GetGroupNames() []string {
        names := make([]string, 0, len(groupToDescriptors))
        for name := range groupToDescriptors {
                names = append(names, name)
        }

        // Map iteration order is random; sort for a stable result.
        sort.Strings(names)
        return names
}

// GetErrorCodeGroup returns the descriptors registered under the group name,
// ordered by Value. The registry's own slice is sorted in place and returned
// as-is (not copied); an unknown name yields a nil slice.
func GetErrorCodeGroup(name string) []ErrorDescriptor {
        group := byValue(groupToDescriptors[name])
        sort.Sort(group)
        return []ErrorDescriptor(group)
}

// GetErrorAllDescriptors gathers the descriptors of every registered group
// into one freshly allocated slice ordered by Value. The result is never nil.
func GetErrorAllDescriptors() []ErrorDescriptor {
        all := byValue{}
        for _, name := range GetGroupNames() {
                all = append(all, GetErrorCodeGroup(name)...)
        }

        // Each group is sorted on its own; re-sort the merged list as a whole.
        sort.Sort(all)
        return []ErrorDescriptor(all)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "compress/flate"
        "compress/gzip"
        "context"
        "encoding/json"
        "errors"
        "fmt"
        "io"
        "net/http"
        "net/url"
        "strings"

        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/klauspost/compress/zstd"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// dockerFetcher fetches content over HTTP. The host list, the refspec and the
// request/filterHosts helpers it uses all come from the embedded dockerBase.
type dockerFetcher struct {
        *dockerBase
}

// Fetch returns a reader for the content described by desc.
//
// No request is made by Fetch itself: the returned reader opens the content on
// demand, starting at whatever offset it currently needs. Each open tries, in
// order:
//
//  1. every http(s) URL listed in desc.URLs, moving on to the next URL only
//     when the previous one reports not found (any other error is returned);
//  2. for manifest and index media types, the "manifests" endpoint of each
//     pull-capable host;
//  3. otherwise the "blobs" endpoint of each pull-capable host.
//
// For the host loops the error of the first failing host is returned when no
// host succeeds. Fetch fails immediately with an errdefs.ErrNotFound-wrapping
// error when no configured host has the pull capability, and returns any
// error from ContextWithRepositoryScope.
func (r dockerFetcher) Fetch(ctx context.Context, desc ocispec.Descriptor) (io.ReadCloser, error) {
        ctx = log.WithLogger(ctx, log.G(ctx).WithField("digest", desc.Digest))

        hosts := r.filterHosts(HostCapabilityPull)
        if len(hosts) == 0 {
                return nil, fmt.Errorf("no pull hosts: %w", errdefs.ErrNotFound)
        }

        ctx, err := ContextWithRepositoryScope(ctx, r.refspec, false)
        if err != nil {
                return nil, err
        }

        return newHTTPReadSeeker(desc.Size, func(offset int64) (io.ReadCloser, error) {
                // First try to fetch via the external urls.
                for _, us := range desc.URLs {
                        u, err := url.Parse(us)
                        if err != nil {
                                log.G(ctx).WithError(err).Debugf("failed to parse %q", us)
                                continue
                        }
                        if u.Scheme != "http" && u.Scheme != "https" {
                                log.G(ctx).Debug("non-http(s) alternative url is unsupported")
                                continue
                        }

                        // Keep the "url" field local to this attempt. Reassigning the
                        // captured ctx would stack one field per URL and carry it into
                        // the registry requests below and into every later re-open.
                        urlCtx := log.WithLogger(ctx, log.G(ctx).WithField("url", u))
                        log.G(urlCtx).Info("request")

                        // Try this first, parse it
                        host := RegistryHost{
                                Client:       http.DefaultClient,
                                Host:         u.Host,
                                Scheme:       u.Scheme,
                                Path:         u.Path,
                                Capabilities: HostCapabilityPull,
                        }
                        req := r.request(host, http.MethodGet)
                        // Strip namespace from base
                        req.path = u.Path
                        if u.RawQuery != "" {
                                req.path = req.path + "?" + u.RawQuery
                        }

                        rc, err := r.open(urlCtx, req, desc.MediaType, offset)
                        if err != nil {
                                if errdefs.IsNotFound(err) {
                                        continue // try one of the other urls.
                                }

                                return nil, err
                        }

                        return rc, nil
                }

                // Try manifests endpoints for manifests types
                if images.IsManifestType(desc.MediaType) || images.IsIndexType(desc.MediaType) ||
                        desc.MediaType == images.MediaTypeDockerSchema1Manifest {

                        var firstErr error
                        // Only hosts that advertise the pull capability are contacted.
                        for _, host := range hosts {
                                req := r.request(host, http.MethodGet, "manifests", desc.Digest.String())
                                if err := req.addNamespace(r.refspec.Hostname()); err != nil {
                                        return nil, err
                                }

                                rc, err := r.open(ctx, req, desc.MediaType, offset)
                                if err != nil {
                                        // Store the error for referencing later
                                        if firstErr == nil {
                                                firstErr = err
                                        }
                                        continue // try another host
                                }

                                return rc, nil
                        }

                        return nil, firstErr
                }

                // Finally use blobs endpoints
                var firstErr error
                for _, host := range hosts {
                        req := r.request(host, http.MethodGet, "blobs", desc.Digest.String())
                        if err := req.addNamespace(r.refspec.Hostname()); err != nil {
                                return nil, err
                        }

                        rc, err := r.open(ctx, req, desc.MediaType, offset)
                        if err != nil {
                                // Store the error for referencing later
                                if firstErr == nil {
                                        firstErr = err
                                }
                                continue // try another host
                        }

                        return rc, nil
                }

                // Replace a host-specific not-found with one naming the descriptor.
                if errdefs.IsNotFound(firstErr) {
                        firstErr = fmt.Errorf("could not fetch content descriptor %v (%v) from remote: %w",
                                desc.Digest, desc.MediaType, errdefs.ErrNotFound,
                        )
                }

                return nil, firstErr
        })
}

// createGetReq probes host with a HEAD request for the path made of ps and,
// when the probe answers with a status below 300, returns a fresh
// (not yet sent) GET request for the same path together with the
// Content-Length reported by the HEAD response.
//
// mediatype, when non-empty, is listed ahead of "*/*" in the probe's Accept
// header. Errors from namespacing either request, from sending the probe, or
// from a probe status above 299 are returned with a nil request and zero size.
func (r dockerFetcher) createGetReq(ctx context.Context, host RegistryHost, mediatype string, ps ...string) (*request, int64, error) {
        probe := r.request(host, http.MethodHead, ps...)
        if err := probe.addNamespace(r.refspec.Hostname()); err != nil {
                return nil, 0, err
        }

        accept := "*/*"
        if mediatype != "" {
                accept = strings.Join([]string{mediatype, `*/*`}, ", ")
        }
        probe.header.Set("Accept", accept)

        probeResp, err := probe.doWithRetries(ctx, nil)
        if err != nil {
                return nil, 0, err
        }
        // Only the status and headers of the probe matter; drop the body.
        if probeResp.Body != nil {
                probeResp.Body.Close()
        }
        if probeResp.StatusCode >= 300 {
                return nil, 0, fmt.Errorf("unexpected HEAD status code %v: %s", probe.String(), probeResp.Status)
        }

        get := r.request(host, http.MethodGet, ps...)
        if err := get.addNamespace(r.refspec.Hostname()); err != nil {
                return nil, 0, err
        }
        return get, probeResp.ContentLength, nil
}

// FetchByDigest locates the content identified by dgst and returns a reader
// for it together with a descriptor describing it.
//
// Every pull-capable host is probed (see createGetReq) on the "blobs"
// endpoint first; only if no host answers there is the "manifests" endpoint
// probed. The first successful probe decides which request the returned
// reader uses and supplies the descriptor's Size. The descriptor's MediaType
// is the media type configured through opts, or "application/octet-stream"
// when none was configured.
//
// On failure the returned reader is nil and the descriptor is the zero value.
// An error wrapping errdefs.ErrNotFound is returned when no host has the pull
// capability; otherwise the first probe error is returned (rewritten to name
// the digest when it is a not-found error).
func (r dockerFetcher) FetchByDigest(ctx context.Context, dgst digest.Digest, opts ...remotes.FetchByDigestOpts) (io.ReadCloser, ocispec.Descriptor, error) {
        var desc ocispec.Descriptor
        ctx = log.WithLogger(ctx, log.G(ctx).WithField("digest", dgst))
        var config remotes.FetchByDigestConfig
        for _, o := range opts {
                if err := o(ctx, &config); err != nil {
                        return nil, desc, err
                }
        }

        hosts := r.filterHosts(HostCapabilityPull)
        if len(hosts) == 0 {
                return nil, desc, fmt.Errorf("no pull hosts: %w", errdefs.ErrNotFound)
        }

        ctx, err := ContextWithRepositoryScope(ctx, r.refspec, false)
        if err != nil {
                return nil, desc, err
        }

        var (
                getReq   *request
                sz       int64
                firstErr error
        )

        // "blobs" is preferred; "manifests" is the fallback when no host serves
        // the digest as a blob. Only hosts that advertise the pull capability
        // are contacted.
        for _, endpoint := range []string{"blobs", "manifests"} {
                for _, host := range hosts {
                        getReq, sz, err = r.createGetReq(ctx, host, config.Mediatype, endpoint, dgst.String())
                        if err == nil {
                                break
                        }
                        // Store the error for referencing later
                        if firstErr == nil {
                                firstErr = err
                        }
                }
                if getReq != nil {
                        break
                }
        }

        if getReq == nil {
                if errdefs.IsNotFound(firstErr) {
                        firstErr = fmt.Errorf("could not fetch content %v from remote: %w", dgst, errdefs.ErrNotFound)
                }
                if firstErr == nil {
                        firstErr = fmt.Errorf("could not fetch content %v from remote: (unknown)", dgst)
                }
                return nil, desc, firstErr
        }

        seeker, err := newHTTPReadSeeker(sz, func(offset int64) (io.ReadCloser, error) {
                return r.open(ctx, getReq, config.Mediatype, offset)
        })
        if err != nil {
                return nil, desc, err
        }

        desc = ocispec.Descriptor{
                MediaType: "application/octet-stream",
                Digest:    dgst,
                Size:      sz,
        }
        if config.Mediatype != "" {
                desc.MediaType = config.Mediatype
        }
        return seeker, desc, nil
}

// decodedBody is what open returns when the response carried a
// Content-Encoding: reads go through the outermost decoder, while Close also
// releases the raw HTTP response body the decoders read from. The decoders'
// own Close methods do not close their underlying reader.
type decodedBody struct {
        // ReadCloser is the outermost decoder.
        io.ReadCloser
        // raw is the undecoded HTTP response body.
        raw io.Closer
}

// Close closes the decoder and then the raw body, returning the first error.
func (b decodedBody) Close() error {
        err := b.ReadCloser.Close()
        if rawErr := b.raw.Close(); err == nil {
                err = rawErr
        }
        return err
}

// open sends req and returns a reader positioned at offset within the
// response content, with any Content-Encoding (zstd, gzip, deflate) undone.
//
// mediatype, when non-empty, is listed ahead of "*/*" in the Accept header.
// For offset > 0 a Range header is sent; if the response has a Content-Range
// header it must start at offset, otherwise the first offset bytes of the
// body are read and discarded.
//
// A 404 yields an error wrapping errdefs.ErrNotFound; any other status above
// 299 yields an error that includes the registry's error message when the
// body can be decoded as one. The response body is closed on every error
// return. On success the caller owns the returned reader and must Close it.
func (r dockerFetcher) open(ctx context.Context, req *request, mediatype string, offset int64) (_ io.ReadCloser, retErr error) {
        if mediatype == "" {
                req.header.Set("Accept", "*/*")
        } else {
                req.header.Set("Accept", strings.Join([]string{mediatype, `*/*`}, ", "))
        }
        req.header.Set("Accept-Encoding", "zstd;q=1.0, gzip;q=0.8, deflate;q=0.5")

        if offset > 0 {
                // Note: "Accept-Ranges: bytes" cannot be trusted as some endpoints
                // will return the header without supporting the range. The content
                // range must always be checked.
                req.header.Set("Range", fmt.Sprintf("bytes=%d-", offset))
        }

        resp, err := req.doWithRetries(ctx, nil)
        if err != nil {
                return nil, err
        }
        // On success ownership of the body passes to the caller; on any error
        // below it is released here.
        defer func() {
                if retErr != nil {
                        resp.Body.Close()
                }
        }()

        if resp.StatusCode > 299 {
                // TODO(stevvooe): When doing a offset specific request, we should
                // really distinguish between a 206 and a 200. In the case of 200, we
                // can discard the bytes, hiding the seek behavior from the
                // implementation.

                if resp.StatusCode == http.StatusNotFound {
                        return nil, fmt.Errorf("content at %v not found: %w", req.String(), errdefs.ErrNotFound)
                }
                var registryErr Errors
                if err := json.NewDecoder(resp.Body).Decode(&registryErr); err != nil || registryErr.Len() < 1 {
                        return nil, fmt.Errorf("unexpected status code %v: %v", req.String(), resp.Status)
                }
                return nil, fmt.Errorf("unexpected status code %v: %s - Server message: %s", req.String(), resp.Status, registryErr.Error())
        }
        if offset > 0 {
                cr := resp.Header.Get("content-range")
                if cr != "" {
                        if !strings.HasPrefix(cr, fmt.Sprintf("bytes %d-", offset)) {
                                return nil, fmt.Errorf("unhandled content range in response: %v", cr)
                        }
                } else {
                        // TODO: Should any cases where use of content range
                        // without the proper header be considered?
                        // 206 responses?

                        // Discard up to offset
                        // Could use buffer pool here but this case should be rare
                        n, err := io.Copy(io.Discard, io.LimitReader(resp.Body, offset))
                        if err != nil {
                                return nil, fmt.Errorf("failed to discard to offset: %w", err)
                        }
                        if n != offset {
                                return nil, errors.New("unable to discard to offset")
                        }
                }
        }

        body := resp.Body
        // decoded records whether body has been wrapped by at least one decoder.
        decoded := false
        encoding := strings.FieldsFunc(resp.Header.Get("Content-Encoding"), func(c rune) bool {
                return c == ' ' || c == '\t' || c == ','
        })
        // Encodings are listed in the order they were applied, so undo them
        // from last to first.
        for i := len(encoding) - 1; i >= 0; i-- {
                algorithm := strings.ToLower(encoding[i])
                switch algorithm {
                case "zstd":
                        zr, err := zstd.NewReader(body)
                        if err != nil {
                                return nil, err
                        }
                        body, decoded = zr.IOReadCloser(), true
                case "gzip":
                        gr, err := gzip.NewReader(body)
                        if err != nil {
                                return nil, err
                        }
                        body, decoded = gr, true
                case "deflate":
                        body, decoded = flate.NewReader(body), true
                case "identity", "":
                        // no content-encoding applied, use raw body
                default:
                        return nil, errors.New("unsupported Content-Encoding algorithm: " + algorithm)
                }
        }

        if decoded {
                // Closing a decoder leaves resp.Body open; tie the two together so
                // the caller's Close releases the connection as well.
                return decodedBody{ReadCloser: body, raw: resp.Body}, nil
        }
        return body, nil
}

//go:build gofuzz

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "context"
        "fmt"
        "io"
        "net/http"
        "net/http/httptest"
        "net/url"
        "strconv"
)

// FuzzFetcher is the fuzzing entry point for dockerFetcher.open. It serves
// data from a local test server, fetches it back through open and panics if
// the number of bytes read differs from len(data).
//
// It returns -1 for empty input, 0 when the server URL cannot be parsed or the
// fetch or read fails, and 1 when the round trip succeeded.
func FuzzFetcher(data []byte) int {
        dataLen := len(data)
        if dataLen == 0 {
                return -1
        }

        s := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
                rw.Header().Set("content-range", fmt.Sprintf("bytes %d-%d/%d", 0, dataLen-1, dataLen))
                rw.Header().Set("content-length", strconv.Itoa(dataLen))
                rw.Write(data)
        }))
        defer s.Close()

        u, err := url.Parse(s.URL)
        if err != nil {
                return 0
        }

        f := dockerFetcher{&dockerBase{
                repository: "nonempty",
        }}
        host := RegistryHost{
                Client: s.Client(),
                Host:   u.Host,
                Scheme: u.Scheme,
                Path:   u.Path,
        }

        ctx := context.Background()
        req := f.request(host, http.MethodGet)
        rc, err := f.open(ctx, req, "", 0)
        if err != nil {
                return 0
        }
        // Release the response body on every path; this runs before s.Close.
        defer rc.Close()

        b, err := io.ReadAll(rc)
        if err != nil {
                return 0
        }

        expected := data
        if len(b) != len(expected) {
                panic("len of request is not equal to len of expected but should be")
        }
        return 1
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "context"
        "fmt"
        "net/url"
        "strings"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/containerd/v2/pkg/reference"
        "github.com/containerd/log"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// AppendDistributionSourceLabel returns a handler that records the repository
// named by ref in the distribution source label of each blob it visits.
//
// ref is parsed once up front; a parse failure is returned instead of a
// handler. For every descriptor the handler reads the blob's current label
// for ref's host, merges ref's repository into it, and writes that single
// label back through manager. When the merged label fails validation the
// update is skipped with a warning and no error. The handler never returns
// child descriptors.
func AppendDistributionSourceLabel(manager content.Manager, ref string) (images.HandlerFunc, error) {
        refspec, err := reference.Parse(ref)
        if err != nil {
                return nil, err
        }

        // The dummy scheme lets url.Parse split the locator into host and path.
        u, err := url.Parse("dummy://" + refspec.Locator)
        if err != nil {
                return nil, err
        }

        source := u.Hostname()
        repo := strings.TrimPrefix(u.Path, "/")

        handler := func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                current, err := manager.Info(ctx, desc.Digest)
                if err != nil {
                        return nil, err
                }

                key := distributionSourceLabelKey(source)
                // Indexing a nil label map yields "", the same as "no label yet".
                value := appendDistributionSourceLabel(current.Labels[key], repo)

                // The repo name has been limited under 256 and the distribution
                // label might hit the limitation of label size, when blob data
                // is used as the very, very common layer.
                if err := labels.Validate(key, value); err != nil {
                        log.G(ctx).Warnf("skip to append distribution label: %s", err)
                        return nil, nil
                }

                update := content.Info{
                        Digest: desc.Digest,
                        Labels: map[string]string{key: value},
                }
                _, err = manager.Update(ctx, update, fmt.Sprintf("labels.%s", key))
                return nil, err
        }
        return handler, nil
}

// appendDistributionSourceLabel merges repo into the comma-separated list in
// originLabel and returns the result as a comma-separated list that is sorted
// in ascending order, free of duplicates and free of empty entries.
func appendDistributionSourceLabel(originLabel, repo string) string {
        var pending []string
        if originLabel != "" {
                pending = strings.Split(originLabel, ",")
        }
        pending = append(pending, repo)

        // Build the result by inserting each entry at its sorted position.
        merged := make([]string, 0, len(pending))
        for _, name := range pending {
                if name == "" {
                        continue
                }

                pos := 0
                for pos < len(merged) && merged[pos] < name {
                        pos++
                }
                if pos < len(merged) && merged[pos] == name {
                        continue // already present
                }

                // Open a slot at pos and place the new entry there.
                merged = append(merged, "")
                copy(merged[pos+1:], merged[pos:])
                merged[pos] = name
        }

        return strings.Join(merged, ",")
}

// distributionSourceLabelKey returns the label key for source: the
// distribution source label prefix and source joined by a dot.
func distributionSourceLabelKey(source string) string {
        return strings.Join([]string{labels.LabelDistributionSource, source}, ".")
}

// selectRepositoryMountCandidate picks, from the repositories recorded in
// sources for refspec's host, the one sharing the most leading path
// components with refspec's own repository. The repository itself is never
// picked, and among equally good candidates the one listed last wins.
//
// It returns "" when the locator cannot be parsed, when sources has no
// (non-empty) entry for the host, or when no other repository is listed.
func selectRepositoryMountCandidate(refspec reference.Spec, sources map[string]string) string {
        u, err := url.Parse("dummy://" + refspec.Locator)
        if err != nil {
                // NOTE: basically, it won't be error here
                return ""
        }

        target := strings.TrimPrefix(u.Path, "/")

        // A missing key reads as "", so one emptiness check covers both cases.
        repoLabel := sources[distributionSourceLabelKey(u.Hostname())]
        if repoLabel == "" {
                return ""
        }

        targetParts := strings.Split(target, "/")
        best, bestLen := "", 0
        for _, candidate := range strings.Split(repoLabel, ",") {
                // the target repo is not a candidate
                if candidate == target {
                        continue
                }

                shared := commonPrefixComponents(targetParts, candidate)
                if shared < bestLen {
                        continue
                }
                best, bestLen = candidate, shared
        }
        return best
}

// commonPrefixComponents counts how many leading "/"-separated components of
// target equal the corresponding entries of components.
func commonPrefixComponents(components []string, target string) int {
        matched := 0
        for idx, part := range strings.Split(target, "/") {
                if idx >= len(components) || components[idx] != part {
                        break
                }
                matched++
        }
        return matched
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "bytes"
        "fmt"
        "io"

        "github.com/containerd/errdefs"
        "github.com/containerd/log"
)

// maxRetry bounds how many consecutive unexpected EOFs that delivered no data
// httpReadSeeker.Read will answer with a reconnect; once the count exceeds
// maxRetry the error is returned to the caller.
const maxRetry = 3

// httpReadSeeker is a reader over remote content that opens its body lazily
// through the open callback and can reopen it at a different offset after a
// Seek or a dropped connection.
type httpReadSeeker struct {
        // size is the total content length; -1 is treated as unknown.
        size int64
        // offset is the position of the next byte to read.
        offset int64
        // rc is the currently open body, or nil when none is open.
        rc io.ReadCloser
        // open returns a body that starts at the given offset.
        open func(offset int64) (io.ReadCloser, error)
        // closed is set by Close; afterwards Read returns io.EOF and Seek fails.
        closed bool

        // errsWithNoProgress counts consecutive unexpected EOFs that returned no
        // data; it is reset whenever a read makes progress or succeeds.
        errsWithNoProgress int
}

// newHTTPReadSeeker returns a reader for content of the given size whose body
// is obtained from open. open is not called here; the returned error is
// always nil.
func newHTTPReadSeeker(size int64, open func(offset int64) (io.ReadCloser, error)) (io.ReadCloser, error) {
        hrs := new(httpReadSeeker)
        hrs.size = size
        hrs.open = open
        return hrs, nil
}

// Read reads from the current body into p, opening the body first if needed,
// and advances the offset by the number of bytes read.
//
// When the body ends with io.ErrUnexpectedEOF the body is closed and reopened
// at the current offset; if the reopen succeeds the error is hidden from the
// caller. After more than maxRetry such errors in a row without any data the
// error is returned without another reopen. On io.EOF the body is closed
// right away. A closed reader always returns io.EOF.
func (hrs *httpReadSeeker) Read(p []byte) (n int, err error) {
        if hrs.closed {
                return 0, io.EOF
        }

        src, err := hrs.reader()
        if err != nil {
                return 0, err
        }

        n, err = src.Read(p)
        hrs.offset += int64(n)
        if n > 0 || err == nil {
                hrs.errsWithNoProgress = 0
        }

        switch err {
        case io.ErrUnexpectedEOF:
                // connection closed unexpectedly. try reconnecting.
                if n == 0 {
                        hrs.errsWithNoProgress++
                        if hrs.errsWithNoProgress > maxRetry {
                                // too many retries for this offset with no progress
                                return n, err
                        }
                }
                if hrs.rc != nil {
                        if closeErr := hrs.rc.Close(); closeErr != nil {
                                log.L.WithError(closeErr).Error("httpReadSeeker: failed to close ReadCloser")
                        }
                        hrs.rc = nil
                }
                if _, reopenErr := hrs.reader(); reopenErr == nil {
                        return n, nil
                }
        case io.EOF:
                // The CRI's imagePullProgressTimeout relies on responseBody.Close to
                // update the process monitor's status. If the err is io.EOF, close
                // the connection since there is no more available data.
                if hrs.rc != nil {
                        if closeErr := hrs.rc.Close(); closeErr != nil {
                                log.L.WithError(closeErr).Error("httpReadSeeker: failed to close ReadCloser after io.EOF")
                        }
                        hrs.rc = nil
                }
        }

        return n, err
}

// Close marks the reader as closed and closes the open body, if any,
// returning that body's close error. Calling Close again is a no-op that
// returns nil.
func (hrs *httpReadSeeker) Close() error {
        if hrs.closed {
                return nil
        }
        hrs.closed = true

        if hrs.rc == nil {
                return nil
        }
        return hrs.rc.Close()
}

// Seek sets the offset for the next Read according to whence (io.SeekStart,
// io.SeekCurrent or io.SeekEnd) and returns the new absolute offset.
//
// No request is made here: when the position changes, the open body is closed
// and dropped so that the next Read reopens at the new offset. Seek fails
// with an errdefs.ErrUnavailable-wrapping error on a closed reader or when
// seeking from the end of content of unknown size, and with an
// errdefs.ErrInvalidArgument-wrapping error for an unknown whence or a
// negative resulting position.
func (hrs *httpReadSeeker) Seek(offset int64, whence int) (int64, error) {
        if hrs.closed {
                return 0, fmt.Errorf("Fetcher.Seek: closed: %w", errdefs.ErrUnavailable)
        }

        var target int64
        switch whence {
        case io.SeekStart:
                target = offset
        case io.SeekCurrent:
                target = hrs.offset + offset
        case io.SeekEnd:
                if hrs.size == -1 {
                        return 0, fmt.Errorf("Fetcher.Seek: unknown size, cannot seek from end: %w", errdefs.ErrUnavailable)
                }
                target = hrs.size + offset
        default:
                return 0, fmt.Errorf("Fetcher.Seek: invalid whence: %w", errdefs.ErrInvalidArgument)
        }

        if target < 0 {
                return 0, fmt.Errorf("Fetcher.Seek: negative offset: %w", errdefs.ErrInvalidArgument)
        }

        // Seeking to the current position keeps the open body untouched.
        if target == hrs.offset {
                return hrs.offset, nil
        }

        if hrs.rc != nil {
                if err := hrs.rc.Close(); err != nil {
                        log.L.WithError(err).Error("Fetcher.Seek: failed to close ReadCloser")
                }
                hrs.rc = nil
        }
        hrs.offset = target

        return hrs.offset, nil
}

// reader returns the body to read from, opening one at the current offset
// when none is open.
//
// When the size is known and the offset is at or past it, an empty reader is
// installed instead of making a request. Otherwise the open callback is
// invoked; a missing callback yields an errdefs.ErrNotImplemented-wrapping
// error and a failing callback yields its error wrapped.
func (hrs *httpReadSeeker) reader() (io.Reader, error) {
        if hrs.rc != nil {
                return hrs.rc, nil
        }

        if hrs.size != -1 && hrs.offset >= hrs.size {
                // There is an edge case here where offset == size of the content. If
                // we seek, we will probably get an error for content that cannot be
                // sought (?). In that case, we should err on committing the content,
                // as the length is already satisfied but we just return the empty
                // reader instead.
                hrs.rc = io.NopCloser(bytes.NewReader([]byte{}))
                return hrs.rc, nil
        }

        // only try to reopen the body request if we are seeking to a value
        // less than the actual size.
        if hrs.open == nil {
                return nil, fmt.Errorf("cannot open: %w", errdefs.ErrNotImplemented)
        }

        body, err := hrs.open(hrs.offset)
        if err != nil {
                return nil, fmt.Errorf("httpReadSeeker: failed open: %w", err)
        }

        // Close whatever body is installed before replacing it.
        if prev := hrs.rc; prev != nil {
                if err := prev.Close(); err != nil {
                        log.L.WithError(err).Error("httpReadSeeker: failed to close ReadCloser")
                }
        }
        hrs.rc = body

        return hrs.rc, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "context"
        "errors"
        "fmt"
        "io"
        "net/http"
        "net/url"
        "path"
        "strings"
        "sync"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/remotes"
        remoteserrors "github.com/containerd/containerd/v2/core/remotes/errors"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// dockerPusher pushes content to a registry, using the refspec and the
// request/filterHosts helpers of the embedded dockerBase.
type dockerPusher struct {
        *dockerBase
        // object is handed to getManifestPath, together with the digest, to
        // build the path used for manifest and index descriptors.
        object string

        // TODO: namespace tracker
        // tracker holds the push status looked up per ref. When it also
        // implements StatusTrackLocker, push locks the ref while it runs.
        tracker StatusTracker
}

// Writer implements the Ingester API of the content store on top of push.
// Unlike Push it makes the client receive ErrUnavailable when an upload of
// the same ref is already on-going.
//
// The ref supplied through opts must not be empty; otherwise an error
// wrapping errdefs.ErrInvalidArgument is returned. An error from any option
// is returned as-is.
//
// Note that the tracker MUST implement StatusTrackLocker interface to avoid
// race condition on StatusTracker.
func (p dockerPusher) Writer(ctx context.Context, opts ...content.WriterOpt) (content.Writer, error) {
        var cfg content.WriterOpts
        for i := range opts {
                if err := opts[i](&cfg); err != nil {
                        return nil, err
                }
        }

        if ref := cfg.Ref; ref != "" {
                return p.push(ctx, cfg.Desc, ref, true)
        }
        return nil, fmt.Errorf("ref must not be empty: %w", errdefs.ErrInvalidArgument)
}

// Push starts a push of desc and returns the writer for its content. The ref
// is derived from desc with remotes.MakeRefKey, and an already on-going push
// of that ref is not reported as unavailable.
func (p dockerPusher) Push(ctx context.Context, desc ocispec.Descriptor) (content.Writer, error) {
        ref := remotes.MakeRefKey(ctx, desc)
        return p.push(ctx, desc, ref, false)
}

// push checks whether desc is already present on the first push-capable host
// and, if it is not, starts an upload and returns a writer that feeds it.
//
// The steps are:
//  1. If the tracker implements StatusTrackLocker, hold its lock on ref for
//     the duration of the call.
//  2. Consult the tracker: a committed, fully written ref yields
//     ErrAlreadyExists; with unavailableOnFail set, a tracked ref that has no
//     ErrClosed yields ErrUnavailable.
//  3. Issue a HEAD request for the manifest or blob. A 200 response (with a
//     matching Docker-Content-Digest when the manifest is addressed by tag)
//     marks the ref committed and yields ErrAlreadyExists.
//  4. For manifests, prepare a PUT to the manifest path. For blobs, POST to
//     "blobs/uploads/" (optionally attempting a cross-repository mount) and
//     prepare a PUT to the returned Location.
//  5. Record the initial status and run the PUT in a goroutine whose body is
//     a pipe written to by the returned pushWriter.
//
// ref is the tracker key; desc supplies the digest, size and media type.
func (p dockerPusher) push(ctx context.Context, desc ocispec.Descriptor, ref string, unavailableOnFail bool) (content.Writer, error) {
        if l, ok := p.tracker.(StatusTrackLocker); ok {
                l.Lock(ref)
                defer l.Unlock(ref)
        }
        ctx, err := ContextWithRepositoryScope(ctx, p.refspec, true)
        if err != nil {
                return nil, err
        }
        status, err := p.tracker.GetStatus(ref)
        if err == nil {
                if status.Committed && status.Offset == status.Total {
                        return nil, fmt.Errorf("ref %v: %w", ref, errdefs.ErrAlreadyExists)
                }
                if unavailableOnFail && status.ErrClosed == nil {
                        // Another push of this ref is happening elsewhere. The rest of function
                        // will continue only when `errdefs.IsNotFound(err) == true` (i.e. there
                        // is no actively-tracked ref already).
                        return nil, fmt.Errorf("push is on-going: %w", errdefs.ErrUnavailable)
                }
                // TODO: Handle incomplete status
        } else if !errdefs.IsNotFound(err) {
                return nil, fmt.Errorf("failed to get status: %w", err)
        }

        hosts := p.filterHosts(HostCapabilityPush)
        if len(hosts) == 0 {
                return nil, fmt.Errorf("no push hosts: %w", errdefs.ErrNotFound)
        }

        var (
                isManifest bool
                existCheck []string
                host       = hosts[0]
        )

        if images.IsManifestType(desc.MediaType) || images.IsIndexType(desc.MediaType) {
                isManifest = true
                existCheck = getManifestPath(p.object, desc.Digest)
        } else {
                existCheck = []string{"blobs", desc.Digest.String()}
        }

        req := p.request(host, http.MethodHead, existCheck...)
        req.header.Set("Accept", strings.Join([]string{desc.MediaType, `*/*`}, ", "))

        log.G(ctx).WithField("url", req.String()).Debugf("checking and pushing to")

        resp, err := req.doWithRetries(ctx, nil)
        if err != nil {
                // An authorization failure on the existence check is not fatal:
                // the push itself is still attempted.
                if !errors.Is(err, ErrInvalidAuthorization) {
                        return nil, err
                }
                log.G(ctx).WithError(err).Debugf("Unable to check existence, continuing with push")
        } else {
                if resp.StatusCode == http.StatusOK {
                        var exists bool
                        if isManifest && existCheck[1] != desc.Digest.String() {
                                // The manifest was looked up by tag, so compare the
                                // digest reported by the registry with the expected one.
                                dgstHeader := digest.Digest(resp.Header.Get("Docker-Content-Digest"))
                                if dgstHeader == desc.Digest {
                                        exists = true
                                }
                        } else {
                                exists = true
                        }

                        if exists {
                                p.tracker.SetStatus(ref, Status{
                                        Committed: true,
                                        PushStatus: PushStatus{
                                                Exists: true,
                                        },
                                        Status: content.Status{
                                                Ref: ref,
                                                // TODO: Set updated time?
                                        },
                                })
                                resp.Body.Close()
                                return nil, fmt.Errorf("content %v on remote: %w", desc.Digest, errdefs.ErrAlreadyExists)
                        }
                } else if resp.StatusCode != http.StatusNotFound {
                        err := remoteserrors.NewUnexpectedStatusErr(resp)
                        log.G(ctx).WithField("resp", resp).WithField("body", string(err.(remoteserrors.ErrUnexpectedStatus).Body)).Debug("unexpected response")
                        resp.Body.Close()
                        return nil, err
                }
                resp.Body.Close()
        }

        if isManifest {
                putPath := getManifestPath(p.object, desc.Digest)
                req = p.request(host, http.MethodPut, putPath...)
                req.header.Add("Content-Type", desc.MediaType)
        } else {
                // Start upload request
                req = p.request(host, http.MethodPost, "blobs", "uploads/")

                mountedFrom := ""
                var resp *http.Response
                if fromRepo := selectRepositoryMountCandidate(p.refspec, desc.Annotations); fromRepo != "" {
                        preq := requestWithMountFrom(req, desc.Digest.String(), fromRepo)
                        pctx := ContextWithAppendPullRepositoryScope(ctx, fromRepo)

                        // NOTE: the fromRepo might be private repo and
                        // auth service still can grant token without error.
                        // but the post request will fail because of 401.
                        //
                        // for the private repo, we should remove mount-from
                        // query and send the request again.
                        resp, err = preq.doWithRetries(pctx, nil)
                        if err != nil {
                                return nil, err
                        }

                        switch resp.StatusCode {
                        case http.StatusUnauthorized:
                                log.G(ctx).Debugf("failed to mount from repository %s", fromRepo)

                                // Discard this response so the plain upload request
                                // below is issued instead.
                                resp.Body.Close()
                                resp = nil
                        case http.StatusCreated:
                                mountedFrom = path.Join(p.refspec.Hostname(), fromRepo)
                        }
                }

                if resp == nil {
                        resp, err = req.doWithRetries(ctx, nil)
                        if err != nil {
                                if errors.Is(err, ErrInvalidAuthorization) {
                                        return nil, fmt.Errorf("push access denied, repository does not exist or may require authorization: %w", err)
                                }
                                return nil, err
                        }
                }
                defer resp.Body.Close()

                switch resp.StatusCode {
                case http.StatusOK, http.StatusAccepted, http.StatusNoContent:
                case http.StatusCreated:
                        // 201 on the upload POST means nothing is left to upload;
                        // record the ref as fully written and committed.
                        p.tracker.SetStatus(ref, Status{
                                Committed: true,
                                PushStatus: PushStatus{
                                        MountedFrom: mountedFrom,
                                },
                                Status: content.Status{
                                        Ref:    ref,
                                        Total:  desc.Size,
                                        Offset: desc.Size,
                                },
                        })
                        return nil, fmt.Errorf("content %v on remote: %w", desc.Digest, errdefs.ErrAlreadyExists)
                default:
                        err := remoteserrors.NewUnexpectedStatusErr(resp)
                        log.G(ctx).WithField("resp", resp).WithField("body", string(err.(remoteserrors.ErrUnexpectedStatus).Body)).Debug("unexpected response")
                        return nil, err
                }

                var (
                        location = resp.Header.Get("Location")
                        lurl     *url.URL
                        lhost    = host
                )
                // Support paths without host in location
                if strings.HasPrefix(location, "/") {
                        lurl, err = url.Parse(lhost.Scheme + "://" + lhost.Host + location)
                        if err != nil {
                                return nil, fmt.Errorf("unable to parse location %v: %w", location, err)
                        }
                } else {
                        if !strings.Contains(location, "://") {
                                location = lhost.Scheme + "://" + location
                        }
                        lurl, err = url.Parse(location)
                        if err != nil {
                                return nil, fmt.Errorf("unable to parse location %v: %w", location, err)
                        }

                        if lurl.Host != lhost.Host || lhost.Scheme != lurl.Scheme {
                                lhost.Scheme = lurl.Scheme
                                lhost.Host = lurl.Host

                                // Check if different than what was requested, accounting for fallback in the transport layer
                                requested := resp.Request.URL
                                if requested.Host != lhost.Host || requested.Scheme != lhost.Scheme {
                                        // Strip authorizer if change to host or scheme
                                        lhost.Authorizer = nil
                                        log.G(ctx).WithField("host", lhost.Host).WithField("scheme", lhost.Scheme).Debug("upload changed destination, authorizer removed")
                                }
                        }
                }
                q := lurl.Query()
                q.Add("digest", desc.Digest.String())

                req = p.request(lhost, http.MethodPut)
                req.header.Set("Content-Type", "application/octet-stream")
                req.path = lurl.Path + "?" + q.Encode()
        }
        p.tracker.SetStatus(ref, Status{
                Status: content.Status{
                        Ref:       ref,
                        Total:     desc.Size,
                        Expected:  desc.Digest,
                        StartedAt: time.Now(),
                },
        })

        // TODO: Support chunked upload

        pushw := newPushWriter(p.dockerBase, ref, desc.Digest, p.tracker, isManifest)

        // Each time the request body is requested a fresh pipe is created and
        // its write end is handed to the writer.
        req.body = func() (io.ReadCloser, error) {
                pr, pw := io.Pipe()
                pushw.setPipe(pw)
                return pr, nil
        }
        req.size = desc.Size

        go func() {
                resp, err := req.doWithRetries(ctx, nil)
                if err != nil {
                        pushw.setError(err)
                        return
                }

                switch resp.StatusCode {
                case http.StatusOK, http.StatusCreated, http.StatusNoContent:
                default:
                        err := remoteserrors.NewUnexpectedStatusErr(resp)
                        log.G(ctx).WithField("resp", resp).WithField("body", string(err.(remoteserrors.ErrUnexpectedStatus).Body)).Debug("unexpected response")
                        // The response is not handed to the writer on this path, so
                        // its body must be released here, as the other
                        // unexpected-status branches of this function do.
                        resp.Body.Close()
                        pushw.setError(err)
                        return
                }
                pushw.setResponse(resp)
        }()

        return pushw, nil
}

// getManifestPath returns the request path components used to address a
// manifest: always "manifests" followed by either a tag or the digest.
//
// When object contains '@', the text after the first '@' is compared with
// dgst: on a match the part before '@' is used as the tag, otherwise the
// object is ignored. An empty tag falls back to the digest string.
func getManifestPath(object string, dgst digest.Digest) []string {
        want := dgst.String()

        if tag, suffix, found := strings.Cut(object, "@"); found {
                if suffix == want {
                        // strip the digest suffix so the tag addresses the manifest
                        object = tag
                } else {
                        // the object refers to other content: use digest, not tag
                        object = ""
                }
        }

        target := object
        if target == "" {
                target = want
        }
        return []string{"manifests", target}
}

// pushWriter is the writer returned for an in-progress upload. Bytes written
// to it are forwarded into a pipe; the pipe, the final response and any error
// are delivered to it over channels via setPipe, setResponse and setError.
type pushWriter struct {
        base *dockerBase
        // ref is the key under which progress is stored in tracker.
        ref string

        // pipe is the write end currently in use; it is nil until the first
        // pipe has been received from pipeC.
        pipe *io.PipeWriter

        // done is closed (once, guarded by closeOnce) when Close is called.
        done      chan struct{}
        closeOnce sync.Once

        // pipeC, respC and errC each have a buffer of one (see newPushWriter).
        pipeC chan *io.PipeWriter
        respC chan *http.Response
        errC  chan error

        // isManifest is recorded at construction time.
        isManifest bool

        // expected is the digest reported by Digest.
        expected digest.Digest
        tracker  StatusTracker
}

// newPushWriter builds a pushWriter for ref. The pipe, response and error
// channels are given a buffer of one; the done channel is unbuffered and is
// only ever closed.
func newPushWriter(db *dockerBase, ref string, expected digest.Digest, tracker StatusTracker, isManifest bool) *pushWriter {
        w := new(pushWriter)

        w.base = db
        w.ref = ref
        w.expected = expected
        w.tracker = tracker
        w.isManifest = isManifest

        w.done = make(chan struct{})
        w.pipeC = make(chan *io.PipeWriter, 1)
        w.respC = make(chan *http.Response, 1)
        w.errC = make(chan error, 1)

        return w
}

// setPipe offers a new pipe write end to the writer. The call returns without
// delivering it if the writer has been closed.
func (pw *pushWriter) setPipe(p *io.PipeWriter) {
        select {
        case pw.pipeC <- p:
                // delivered
        case <-pw.done:
                // writer closed; nobody will pick the pipe up
        }
}

// setError reports a request failure to the writer. The call returns without
// delivering it if the writer has been closed.
func (pw *pushWriter) setError(err error) {
        select {
        case pw.errC <- err:
                // delivered
        case <-pw.done:
                // writer closed; the error is dropped
        }
}

// setResponse hands the final HTTP response to the writer so that Commit can
// inspect it. If the writer has already been closed the response is dropped;
// in that case its body is closed here, because no other code will ever
// receive the response and release the underlying connection.
func (pw *pushWriter) setResponse(resp *http.Response) {
        select {
        case <-pw.done:
                if resp != nil && resp.Body != nil {
                        resp.Body.Close()
                }
        case pw.respC <- resp:
        }
}

// replacePipe installs p as the active pipe. Installing the first pipe
// succeeds with a nil error. Replacing an existing pipe closes the old one
// with content.ErrReset, rewinds the tracked offset to zero and returns
// content.ErrReset so that the caller restarts the write from the beginning.
// A tracker lookup failure is returned as is.
func (pw *pushWriter) replacePipe(p *io.PipeWriter) error {
        previous := pw.pipe
        if previous != nil {
                previous.CloseWithError(content.ErrReset)
        }
        pw.pipe = p
        if previous == nil {
                return nil
        }

        // Bytes already sent through the old pipe cannot be replayed, so the
        // recorded progress is reset and the caller is told to start over.
        st, err := pw.tracker.GetStatus(pw.ref)
        if err != nil {
                return err
        }
        st.Offset, st.UpdatedAt = 0, time.Now()
        pw.tracker.SetStatus(pw.ref, st)
        return content.ErrReset
}

// Write forwards p into the current request pipe and adds the number of bytes
// written to the tracked offset.
//
// It returns io.ErrClosedPipe when the writer has been closed, and whatever
// replacePipe returns (content.ErrReset when a pipe was replaced) when a new
// pipe arrives while one is already in use.
func (pw *pushWriter) Write(p []byte) (n int, err error) {
        status, err := pw.tracker.GetStatus(pw.ref)
        if err != nil {
                return n, err
        }

        if pw.pipe == nil {
                // No pipe yet: block until the first one arrives or the writer
                // is closed.
                select {
                case <-pw.done:
                        return 0, io.ErrClosedPipe
                case p := <-pw.pipeC:
                        pw.replacePipe(p)
                }
        } else {
                // A pipe is in use: check, without blocking, whether the writer
                // was closed or a replacement pipe has been delivered.
                select {
                case <-pw.done:
                        return 0, io.ErrClosedPipe
                case p := <-pw.pipeC:
                        return 0, pw.replacePipe(p)
                default:
                }
        }

        n, err = pw.pipe.Write(p)
        if errors.Is(err, io.ErrClosedPipe) {
                // if the pipe is closed, we might have the original error on the error
                // channel - so we should try and get it
                select {
                case <-pw.done:
                case err = <-pw.errC:
                        pw.Close()
                case p := <-pw.pipeC:
                        return 0, pw.replacePipe(p)
                case resp := <-pw.respC:
                        // Put the response back so it can be received later.
                        pw.setResponse(resp)
                }
        }
        status.Offset += int64(n)
        status.UpdatedAt = time.Now()
        pw.tracker.SetStatus(pw.ref, status)
        return
}

// Close marks the writer as closed and closes the active pipe, if any. It may
// be called more than once. When the tracked upload has not been committed,
// the status is flagged with ErrClosed so that a later write can retry it.
func (pw *pushWriter) Close() error {
        // The done channel must be closed exactly once, even when Close is
        // called repeatedly.
        pw.closeOnce.Do(func() { close(pw.done) })

        if pw.pipe == nil {
                return nil
        }

        if st, err := pw.tracker.GetStatus(pw.ref); err == nil && !st.Committed {
                // Closing an incomplete writer. Record this as an error so that following write can retry it.
                st.ErrClosed = errors.New("closed incomplete writer")
                pw.tracker.SetStatus(pw.ref, st)
        }
        return pw.pipe.Close()
}

// Status returns the content status currently tracked for the writer's ref,
// or the tracker's error together with a zero status.
func (pw *pushWriter) Status() (content.Status, error) {
        tracked, err := pw.tracker.GetStatus(pw.ref)
        if err == nil {
                return tracked.Status, nil
        }
        return content.Status{}, err
}

// Digest returns the digest the writer was created with. It is not derived
// from the bytes written so far.
func (pw *pushWriter) Digest() digest.Digest {
        // TODO: Get rid of this function?
        expected := pw.expected
        return expected
}

// Commit finishes the upload: it closes the request pipe, waits for the
// outcome of the request and validates it against size and expected.
//
// It returns io.ErrClosedPipe if the writer was closed, the request error if
// one was reported, or the result of replacePipe if the pipe changed while
// committing. The response must carry a status of 200, 201, 202 or 204 and a
// parseable Docker-Content-Digest header equal to expected (or, when expected
// is empty, to the digest recorded in the tracked status). A positive size
// must equal the tracked offset. On success the tracked status is marked
// committed. ctx and opts are not used by this implementation.
//
// NOTE(review): pw.pipe is dereferenced without a nil check, which appears to
// assume that Write has installed a pipe before Commit is called - confirm.
func (pw *pushWriter) Commit(ctx context.Context, size int64, expected digest.Digest, opts ...content.Opt) error {
        // Check whether read has already thrown an error
        if _, err := pw.pipe.Write([]byte{}); err != nil && !errors.Is(err, io.ErrClosedPipe) {
                return fmt.Errorf("pipe error before commit: %w", err)
        }

        if err := pw.pipe.Close(); err != nil {
                return err
        }
        // TODO: timeout waiting for response
        var resp *http.Response
        select {
        case <-pw.done:
                return io.ErrClosedPipe
        case err := <-pw.errC:
                pw.Close()
                return err
        case resp = <-pw.respC:
                defer resp.Body.Close()
        case p := <-pw.pipeC:
                // check whether the pipe has changed in the commit, because sometimes Write
                // can complete successfully, but the pipe may have changed. In that case, the
                // content needs to be reset.
                return pw.replacePipe(p)
        }

        // 201 is specified return status, some registries return
        // 200, 202 or 204.
        switch resp.StatusCode {
        case http.StatusOK, http.StatusCreated, http.StatusNoContent, http.StatusAccepted:
        default:
                return remoteserrors.NewUnexpectedStatusErr(resp)
        }

        status, err := pw.tracker.GetStatus(pw.ref)
        if err != nil {
                return fmt.Errorf("failed to get status: %w", err)
        }

        // A size of zero or less disables the length check.
        if size > 0 && size != status.Offset {
                return fmt.Errorf("unexpected size %d, expected %d", status.Offset, size)
        }

        // Fall back to the digest recorded when the upload was started.
        if expected == "" {
                expected = status.Expected
        }

        actual, err := digest.Parse(resp.Header.Get("Docker-Content-Digest"))
        if err != nil {
                return fmt.Errorf("invalid content digest in response: %w", err)
        }

        if actual != expected {
                return fmt.Errorf("got digest %s, expected %s", actual, expected)
        }

        status.Committed = true
        status.UpdatedAt = time.Now()
        pw.tracker.SetStatus(pw.ref, status)

        return nil
}

// Truncate is not supported for remote uploads and always returns an error;
// size is ignored.
func (pw *pushWriter) Truncate(size int64) error {
        // TODO: if blob close request and start new request at offset
        // TODO: always error on manifest
        unsupported := errors.New("cannot truncate remote upload")
        return unsupported
}

// requestWithMountFrom returns a shallow copy of req whose path carries the
// additional "mount" and "from" query parameters. The original request is
// left untouched. The values are appended verbatim, without escaping.
func requestWithMountFrom(req *request, mount, from string) *request {
        clone := *req

        // Start a query string unless the path already has one.
        delim := "&"
        if !strings.Contains(clone.path, "?") {
                delim = "?"
        }
        clone.path += delim + "mount=" + mount + "&from=" + from

        return &clone
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "errors"
        "net"
        "net/http"
)

// HostCapabilities represents the capabilities of the registry
// host. This also represents the set of operations for which
// the registry host may be trusted to perform. The value is a
// set of bit flags; use Has to test for a capability.
//
// For example pushing is a capability which should only be
// performed on an upstream source, not a mirror.
// Resolving (the process of converting a name into a digest)
// must be considered a trusted operation and only done by
// a host which is trusted (or more preferably by a secure process
// which can prove the provenance of the mapping). A public
// mirror should never be trusted to do a resolve action.
//
// | Registry Type    | Pull | Resolve | Push |
// |------------------|------|---------|------|
// | Public Registry  | yes  | yes     | yes  |
// | Private Registry | yes  | yes     | yes  |
// | Public Mirror    | yes  | no      | no   |
// | Private Mirror   | yes  | yes     | no   |
type HostCapabilities uint8

// Capability flags. Each constant occupies its own bit (1, 2, 4, ...), so
// values can be combined with the | operator.
const (
        // HostCapabilityPull represents the capability to fetch manifests
        // and blobs by digest
        HostCapabilityPull HostCapabilities = 1 << iota

        // HostCapabilityResolve represents the capability to fetch manifests
        // by name
        HostCapabilityResolve

        // HostCapabilityPush represents the capability to push blobs and
        // manifests
        HostCapabilityPush

        // Reserved for future capabilities (i.e. search, catalog, remove)
)

// Has reports whether every capability bit set in t is also set in c.
func (c HostCapabilities) Has(t HostCapabilities) bool {
        missing := t &^ c
        return missing == 0
}

// RegistryHost represents a complete configuration for a registry
// host, representing the capabilities, authorizations, connection
// configuration, and location.
type RegistryHost struct {
        // Client is the HTTP client used for requests to this host.
        Client *http.Client
        // Authorizer authorizes requests to this host; it may be nil.
        Authorizer Authorizer
        // Host and Scheme form the base address of the host
        // (Scheme + "://" + Host).
        Host   string
        Scheme string
        // Path is the API path prefix; the default configuration uses "/v2".
        Path string
        // Capabilities is the set of operations this host may be used for.
        Capabilities HostCapabilities
        // Header holds HTTP headers associated with this host.
        // NOTE(review): presumably added to every request - confirm.
        Header http.Header
}

// isProxy reports whether h serves refhost under a different host name. The
// pair "docker.io" / "registry-1.docker.io" is treated as the same host and
// therefore is not a proxy.
func (h RegistryHost) isProxy(refhost string) bool {
        if refhost == h.Host {
                return false
        }
        isDockerHub := refhost == "docker.io" && h.Host == "registry-1.docker.io"
        return !isDockerHub
}

// RegistryHosts fetches the registry hosts for a given namespace,
// provided by the host component of a distribution image reference.
type RegistryHosts func(string) ([]RegistryHost, error)

// Registries chains several registry configuration functions, consulting
// them in argument order. A function that returns an empty configuration and
// a nil error is skipped and the next one is tried; an error stops the chain
// and is returned together with that function's configuration.
// NOTE: Configurations are never merged. The first non-empty configuration
// wins and is returned to the caller unchanged. If every function returns an
// empty configuration, the result is nil with a nil error.
func Registries(registries ...RegistryHosts) RegistryHosts {
        return func(host string) ([]RegistryHost, error) {
                for i := range registries {
                        hosts, err := registries[i](host)
                        switch {
                        case err != nil:
                                return hosts, err
                        case len(hosts) > 0:
                                return hosts, nil
                        }
                }
                return nil, nil
        }
}

// registryOpts collects the settings applied by RegistryOpt functions and
// consumed by ConfigureDefaultRegistries.
type registryOpts struct {
        // authorizer is copied into each generated RegistryHost.
        authorizer Authorizer
        // plainHTTP, when set, decides per host whether "http" is used
        // instead of "https".
        plainHTTP func(string) (bool, error)
        // host, when set, translates the namespace into the host name to contact.
        host func(string) (string, error)
        // client is the HTTP client to use; http.DefaultClient is used when nil.
        client *http.Client
}

// RegistryOpt defines a registry default option. Options are passed to
// ConfigureDefaultRegistries.
type RegistryOpt func(*registryOpts)

// WithPlainHTTP makes registries use the plaintext http scheme for every
// host for which the match function f returns true.
func WithPlainHTTP(f func(string) (bool, error)) RegistryOpt {
        apply := func(o *registryOpts) {
                o.plainHTTP = f
        }
        return apply
}

// WithAuthorizer sets the authorizer used by default for a registry.
func WithAuthorizer(a Authorizer) RegistryOpt {
        apply := func(o *registryOpts) {
                o.authorizer = a
        }
        return apply
}

// WithHostTranslator sets the function used by default to translate a
// namespace into the registry host name.
func WithHostTranslator(h func(string) (string, error)) RegistryOpt {
        apply := func(o *registryOpts) {
                o.host = h
        }
        return apply
}

// WithClient sets the HTTP client used by default for a registry.
func WithClient(c *http.Client) RegistryOpt {
        apply := func(o *registryOpts) {
                o.client = c
        }
        return apply
}

// ConfigureDefaultRegistries builds a default registry configuration from
// the given options. For more advanced configurations or per-domain setups,
// the RegistryHosts interface should be used directly.
//
// The returned function yields exactly one host with pull, resolve and push
// capabilities, the "/v2" path and the https scheme (http when the plain-HTTP
// matcher accepts the host). Without a host translator, "docker.io" is
// mapped to "registry-1.docker.io".
// NOTE: This function will always return a non-empty value or error
func ConfigureDefaultRegistries(ropts ...RegistryOpt) RegistryHosts {
        opts := registryOpts{}
        for i := range ropts {
                ropts[i](&opts)
        }

        return func(host string) ([]RegistryHost, error) {
                client := opts.client
                if client == nil {
                        client = http.DefaultClient
                }

                scheme := "https"
                if opts.plainHTTP != nil {
                        plain, err := opts.plainHTTP(host)
                        if err != nil {
                                return nil, err
                        }
                        if plain {
                                scheme = "http"
                        }
                }

                target := host
                switch {
                case opts.host != nil:
                        translated, err := opts.host(host)
                        if err != nil {
                                return nil, err
                        }
                        target = translated
                case host == "docker.io":
                        target = "registry-1.docker.io"
                }

                return []RegistryHost{{
                        Client:       client,
                        Authorizer:   opts.authorizer,
                        Host:         target,
                        Scheme:       scheme,
                        Path:         "/v2",
                        Capabilities: HostCapabilityPull | HostCapabilityResolve | HostCapabilityPush,
                }}, nil
        }
}

// MatchAllHosts is a host match function that accepts every host: it always
// returns true and never returns an error.
func MatchAllHosts(_ string) (bool, error) {
        const matched = true
        return matched, nil
}

// MatchLocalhost is a host match function which reports whether host refers
// to the local machine: the name "localhost" or a loopback IP address, with
// or without a port.
//
// An error is returned when host cannot be split into host and port for any
// reason other than a missing port, or when the port is present but empty.
//
// Note: this does not handle matching of ip addresses in octal,
// decimal or hex form.
func MatchLocalhost(host string) (bool, error) {
        // Bare IPv6 loopback literals are recognised up front.
        if host == "::1" || host == "[::1]" {
                return true, nil
        }

        name, port, err := net.SplitHostPort(host)
        switch {
        case err != nil:
                // "missing port in address" is acceptable because the host
                // need not carry a port; any other failure (for example
                // "too many colons in address") is reported to the caller.
                var addrErr *net.AddrError
                if !errors.As(err, &addrErr) || addrErr.Err != "missing port in address" {
                        return false, err
                }
                name = host
        case port == "":
                return false, errors.New("invalid host name format")
        }

        // Map the well-known name onto an address so that a single loopback
        // check covers both cases.
        if name == "localhost" {
                name = "127.0.0.1"
        }

        return net.ParseIP(name).IsLoopback(), nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "context"
        "crypto/tls"
        "errors"
        "fmt"
        "io"
        "net"
        "net/http"
        "net/url"
        "path"
        "strings"

        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/containerd/v2/core/remotes/docker/schema1" //nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility.
        remoteerrors "github.com/containerd/containerd/v2/core/remotes/errors"
        "github.com/containerd/containerd/v2/pkg/reference"
        "github.com/containerd/containerd/v2/pkg/tracing"
        "github.com/containerd/containerd/v2/version"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

var (
        // ErrInvalidAuthorization is used when credentials are passed to a server but
        // those credentials are rejected.
        ErrInvalidAuthorization = errors.New("authorization failed")

        // MaxManifestSize represents the largest size accepted from a registry
        // during resolution. Larger manifests may be accepted using a
        // resolution method other than the registry.
        //
        // NOTE: The max supported layers by some runtimes is 128 and individual
        // layers will not contribute more than 256 bytes, making a
        // reasonable limit for a large image manifest of 32K bytes.
        // 4M bytes represents a much larger upper bound for images which may
        // contain large annotations or be non-images. A proper manifest
        // design puts large metadata in subobjects, as is consistent with the
        // intent of the manifest design.
        //
        // NOTE(review): the multiplier is 1048 rather than 1024, so the value
        // is 4393216 bytes, slightly more than 4 MiB (4194304) - confirm
        // whether this is intentional before changing it.
        MaxManifestSize int64 = 4 * 1048 * 1048
)

// Authorizer is used to authorize HTTP requests based on 401 HTTP responses.
// An Authorizer is responsible for caching tokens or credentials used by
// requests.
type Authorizer interface {
        // Authorize sets the appropriate `Authorization` header on the given
        // request.
        //
        // If no authorization is found for the request, the request remains
        // unmodified. Otherwise it may add an `Authorization` header such as
        //  "bearer <some bearer token>"
        //  "basic <base64 encoded credentials>"
        //
        // It may return remotes/errors.ErrUnexpectedStatus, which, for example,
        // can be used by the caller to find out the status code returned by the registry.
        Authorize(context.Context, *http.Request) error

        // AddResponses adds a 401 response for the authorizer to consider when
        // authorizing requests. The last response should be unauthorized and
        // the previous requests are used to consider redirects and retries
        // that may have led to the 401.
        //
        // If the response is not handled, it returns `ErrNotImplemented`.
        AddResponses(context.Context, []*http.Response) error
}

// ResolverOptions are used to configure a new Docker registry resolver.
type ResolverOptions struct {
        // Hosts returns registry host configurations for a namespace.
        Hosts RegistryHosts

        // Headers are the HTTP request header fields sent by the resolver
        Headers http.Header

        // Tracker is used to track uploads to the registry. This is used
        // since the registry does not have upload tracking and the existing
        // mechanism for getting blob upload status is expensive.
        Tracker StatusTracker

        // Authorizer is used to authorize registry requests
        //
        // Deprecated: use Hosts.
        Authorizer Authorizer

        // Credentials provides username and secret given a host.
        // If username is empty but a secret is given, that secret
        // is interpreted as a long lived token.
        //
        // Deprecated: use Hosts.
        Credentials func(string) (string, string, error)

        // Host provides the hostname given a namespace.
        //
        // Deprecated: use Hosts.
        Host func(string) (string, error)

        // PlainHTTP specifies to use plain http and not https
        //
        // Deprecated: use Hosts.
        PlainHTTP bool

        // Client is the http client to use when making registry requests
        //
        // Deprecated: use Hosts.
        Client *http.Client
}

// DefaultHost is the default host function. It maps the "docker.io"
// namespace to "registry-1.docker.io" and returns every other namespace
// unchanged. The returned error is always nil.
func DefaultHost(ns string) (string, error) {
        switch ns {
        case "docker.io":
                return "registry-1.docker.io", nil
        default:
                return ns, nil
        }
}

// dockerResolver is the resolver returned by NewResolver.
//
// hosts looks up the registry hosts for a reference's hostname, header is
// the base header copied into every request, resolveHeader carries the
// headers (the "Accept" list) added only to resolve requests, and tracker is
// handed to pushers created by Pusher.
type dockerResolver struct {
        hosts         RegistryHosts
        header        http.Header
        resolveHeader http.Header
        tracker       StatusTracker
}

// NewResolver returns a new resolver to a Docker registry.
//
// Defaults applied to options before the resolver is built:
//   - a nil Tracker is replaced by an in-memory tracker;
//   - Headers is cloned (or created) and gets a "User-Agent" of
//     "containerd/<version>" when none is set;
//   - an "Accept" entry in Headers is moved into the resolve-only header,
//     otherwise the resolve header accepts all supported manifest types;
//   - a nil Hosts is built from the deprecated Host, Authorizer,
//     Credentials, Client and PlainHTTP fields.
func NewResolver(options ResolverOptions) remotes.Resolver {
        if options.Tracker == nil {
                options.Tracker = NewInMemoryTracker()
        }

        if options.Headers != nil {
                // work on a private copy to avoid racing on the caller's map
                options.Headers = options.Headers.Clone()
        } else {
                options.Headers = make(http.Header)
        }
        if _, hasUserAgent := options.Headers["User-Agent"]; !hasUserAgent {
                options.Headers.Set("User-Agent", "containerd/"+version.Version)
        }

        resolveHeader := http.Header{}
        if accept, hasAccept := options.Headers["Accept"]; hasAccept {
                // a caller-supplied Accept applies to resolution only
                resolveHeader["Accept"] = accept
                delete(options.Headers, "Accept")
        } else {
                // advertise every media type supported for resolution
                supported := []string{
                        images.MediaTypeDockerSchema2Manifest,
                        images.MediaTypeDockerSchema2ManifestList,
                        ocispec.MediaTypeImageManifest,
                        ocispec.MediaTypeImageIndex, "*/*",
                }
                resolveHeader.Set("Accept", strings.Join(supported, ", "))
        }

        if options.Hosts == nil {
                var registryOpts []RegistryOpt
                if options.Host != nil {
                        registryOpts = append(registryOpts, WithHostTranslator(options.Host))
                }

                if options.Authorizer == nil {
                        options.Authorizer = NewDockerAuthorizer(
                                WithAuthClient(options.Client),
                                WithAuthHeader(options.Headers),
                                WithAuthCreds(options.Credentials))
                }
                registryOpts = append(registryOpts, WithAuthorizer(options.Authorizer))

                if options.Client != nil {
                        registryOpts = append(registryOpts, WithClient(options.Client))
                }
                if !options.PlainHTTP {
                        registryOpts = append(registryOpts, WithPlainHTTP(MatchLocalhost))
                } else {
                        registryOpts = append(registryOpts, WithPlainHTTP(MatchAllHosts))
                }
                options.Hosts = ConfigureDefaultRegistries(registryOpts...)
        }

        resolver := &dockerResolver{
                hosts:         options.Hosts,
                header:        options.Headers,
                resolveHeader: resolveHeader,
                tracker:       options.Tracker,
        }
        return resolver
}

// getManifestMediaType returns the media type announced by the response's
// Content-Type header, with any ";"-separated parameters removed.
func getManifestMediaType(resp *http.Response) string {
        mediaType := resp.Header.Get("Content-Type")

        // Strip encoding data (manifests should always be ascii JSON)
        if semi := strings.IndexByte(mediaType, ';'); semi >= 0 {
                mediaType = mediaType[:semi]
        }

        if mediaType != "text/plain" {
                return mediaType
        }

        // As of Apr 30 2019 the registry.access.redhat.com registry does not specify
        // the content type of any data but uses schema1 manifests.
        return images.MediaTypeDockerSchema1Manifest
}

// countingReader wraps an io.Reader and accumulates, in bytesRead, the
// number of bytes returned by all Read calls so far.
type countingReader struct {
        reader    io.Reader
        bytesRead int64
}

// Read forwards to the wrapped reader and adds the number of bytes it
// reported to bytesRead, including when the read also returns an error.
func (r *countingReader) Read(p []byte) (n int, err error) {
        n, err = r.reader.Read(p)
        r.bytesRead = r.bytesRead + int64(n)
        return n, err
}

// Compile-time check that *dockerResolver implements remotes.Resolver.
var _ remotes.Resolver = &dockerResolver{}

// Resolve looks up ref on the configured registry hosts and returns the
// reference together with a descriptor (digest, media type, size) for it.
//
// The reference must name an object (tag or digest). For a digest reference
// the "manifests" endpoint is tried first and "blobs" second; for a tag only
// "manifests" is tried and hosts must additionally have the resolve
// capability. Each path is attempted on every matching host with a HEAD
// request. If the response lacks a usable Docker-Content-Digest header or
// content length, the content is fetched with GET and the digest and size
// are computed from the body.
//
// When no host yields a result, the error produced furthest along the
// resolution process is returned (request failure < not found < unexpected
// status >= 400 < oversized manifest); if no error was recorded at all a
// not-found error for ref is returned.
func (r *dockerResolver) Resolve(ctx context.Context, ref string) (string, ocispec.Descriptor, error) {
        base, err := r.resolveDockerBase(ref)
        if err != nil {
                return "", ocispec.Descriptor{}, err
        }
        refspec := base.refspec
        if refspec.Object == "" {
                return "", ocispec.Descriptor{}, reference.ErrObjectRequired
        }

        var (
                paths [][]string
                dgst  = refspec.Digest()
                caps  = HostCapabilityPull
        )

        if dgst != "" {
                if err := dgst.Validate(); err != nil {
                        // need to fail here, since we can't actually resolve the invalid
                        // digest.
                        return "", ocispec.Descriptor{}, err
                }

                // turns out, we have a valid digest, make a url.
                paths = append(paths, []string{"manifests", dgst.String()})

                // fallback to blobs on not found.
                paths = append(paths, []string{"blobs", dgst.String()})
        } else {
                // No digest in the reference: look the object (tag) up on the
                // manifests endpoint, which requires a host that can resolve.
                paths = append(paths, []string{"manifests", refspec.Object})
                caps |= HostCapabilityResolve
        }

        hosts := base.filterHosts(caps)
        if len(hosts) == 0 {
                return "", ocispec.Descriptor{}, fmt.Errorf("no resolve hosts: %w", errdefs.ErrNotFound)
        }

        ctx, err = ContextWithRepositoryScope(ctx, refspec, false)
        if err != nil {
                return "", ocispec.Descriptor{}, err
        }

        var (
                // firstErr is the most relevant error encountered during resolution.
                // We use this to determine the error to return, making sure that the
                // error created furthest through the resolution process is returned.
                firstErr         error
                firstErrPriority int
        )
        for _, u := range paths {
                for _, host := range hosts {
                        ctx := log.WithLogger(ctx, log.G(ctx).WithField("host", host.Host))

                        req := base.request(host, http.MethodHead, u...)
                        if err := req.addNamespace(base.refspec.Hostname()); err != nil {
                                return "", ocispec.Descriptor{}, err
                        }

                        // Add the resolve-only headers (the Accept list) on top of the base headers.
                        for key, value := range r.resolveHeader {
                                req.header[key] = append(req.header[key], value...)
                        }

                        log.G(ctx).Debug("resolving")
                        resp, err := req.doWithRetries(ctx, nil)
                        if err != nil {
                                if errors.Is(err, ErrInvalidAuthorization) {
                                        err = fmt.Errorf("pull access denied, repository does not exist or may require authorization: %w", err)
                                }
                                if firstErrPriority < 1 {
                                        firstErr = err
                                        firstErrPriority = 1
                                }
                                log.G(ctx).WithError(err).Info("trying next host")
                                continue // try another host
                        }
                        resp.Body.Close() // don't care about body contents.

                        if resp.StatusCode > 299 {
                                if resp.StatusCode == http.StatusNotFound {
                                        if firstErrPriority < 2 {
                                                firstErr = fmt.Errorf("%s: %w", ref, errdefs.ErrNotFound)
                                                firstErrPriority = 2
                                        }
                                        log.G(ctx).Info("trying next host - response was http.StatusNotFound")
                                        continue
                                }
                                if resp.StatusCode > 399 {
                                        if firstErrPriority < 3 {
                                                firstErr = remoteerrors.NewUnexpectedStatusErr(resp)
                                                firstErrPriority = 3
                                        }
                                        log.G(ctx).Infof("trying next host - response was %s", resp.Status)
                                        continue // try another host
                                }
                                // Remaining 3xx statuses abort resolution immediately.
                                return "", ocispec.Descriptor{}, remoteerrors.NewUnexpectedStatusErr(resp)
                        }
                        size := resp.ContentLength
                        contentType := getManifestMediaType(resp)

                        // if no digest was provided, then only a resolve
                        // trusted registry was contacted, in this case use
                        // the digest header (or content from GET)
                        if dgst == "" {
                                // this is the only point at which we trust the registry. we use the
                                // content headers to assemble a descriptor for the name. when this becomes
                                // more robust, we mostly get this information from a secure trust store.
                                dgstHeader := digest.Digest(resp.Header.Get("Docker-Content-Digest"))

                                if dgstHeader != "" && size != -1 {
                                        if err := dgstHeader.Validate(); err != nil {
                                                return "", ocispec.Descriptor{}, fmt.Errorf("%q in header not a valid digest: %w", dgstHeader, err)
                                        }
                                        dgst = dgstHeader
                                }
                        }
                        if dgst == "" || size == -1 {
                                log.G(ctx).Debug("no Docker-Content-Digest header, fetching manifest instead")

                                req = base.request(host, http.MethodGet, u...)
                                if err := req.addNamespace(base.refspec.Hostname()); err != nil {
                                        return "", ocispec.Descriptor{}, err
                                }

                                for key, value := range r.resolveHeader {
                                        req.header[key] = append(req.header[key], value...)
                                }

                                // NOTE(review): the status code of this GET response is not
                                // checked before its body is hashed — confirm this is intended.
                                resp, err := req.doWithRetries(ctx, nil)
                                if err != nil {
                                        return "", ocispec.Descriptor{}, err
                                }

                                bodyReader := countingReader{reader: resp.Body}

                                contentType = getManifestMediaType(resp)
                                err = func() error {
                                        defer resp.Body.Close()
                                        // The digest is already known; the body is only drained to
                                        // learn its size.
                                        if dgst != "" {
                                                _, err = io.Copy(io.Discard, &bodyReader)
                                                return err
                                        }

                                        // Schema 1 manifests are digested after
                                        // schema1.ReadStripSignature has processed the body.
                                        if contentType == images.MediaTypeDockerSchema1Manifest {
                                                b, err := schema1.ReadStripSignature(&bodyReader)
                                                if err != nil {
                                                        return err
                                                }

                                                dgst = digest.FromBytes(b)
                                                return nil
                                        }

                                        dgst, err = digest.FromReader(&bodyReader)
                                        return err
                                }()
                                if err != nil {
                                        return "", ocispec.Descriptor{}, err
                                }
                                size = bodyReader.bytesRead
                        }
                        // Prevent resolving to excessively large manifests
                        if size > MaxManifestSize {
                                if firstErrPriority < 4 {
                                        firstErr = fmt.Errorf("rejecting %d byte manifest for %s: %w", size, ref, errdefs.ErrNotFound)
                                        firstErrPriority = 4
                                }
                                continue
                        }

                        desc := ocispec.Descriptor{
                                Digest:    dgst,
                                MediaType: contentType,
                                Size:      size,
                        }

                        log.G(ctx).WithField("desc.digest", desc.Digest).Debug("resolved")
                        return ref, desc, nil
                }
        }

        // If above loop terminates without return or error, then no registries
        // were provided.
        if firstErr == nil {
                firstErr = fmt.Errorf("%s: %w", ref, errdefs.ErrNotFound)
        }

        return "", ocispec.Descriptor{}, firstErr
}

// Fetcher returns a fetcher for the repository named by ref. It fails only
// when ref cannot be parsed or its registry hosts cannot be looked up.
func (r *dockerResolver) Fetcher(ctx context.Context, ref string) (remotes.Fetcher, error) {
        db, err := r.resolveDockerBase(ref)
        if err != nil {
                return nil, err
        }

        fetcher := dockerFetcher{dockerBase: db}
        return fetcher, nil
}

// Pusher returns a pusher for the repository named by ref. The pusher
// targets the object (tag or digest) parsed from ref and shares the
// resolver's status tracker.
func (r *dockerResolver) Pusher(ctx context.Context, ref string) (remotes.Pusher, error) {
        db, err := r.resolveDockerBase(ref)
        if err != nil {
                return nil, err
        }

        pusher := dockerPusher{
                dockerBase: db,
                object:     db.refspec.Object,
                tracker:    r.tracker,
        }
        return pusher, nil
}

// resolveDockerBase parses ref into a reference spec and builds the
// dockerBase (repository, hosts, headers) for it.
func (r *dockerResolver) resolveDockerBase(ref string) (*dockerBase, error) {
        spec, parseErr := reference.Parse(ref)
        if parseErr != nil {
                return nil, parseErr
        }

        return r.base(spec)
}

// dockerBase holds the per-reference state shared by the resolver, fetcher
// and pusher: the parsed reference, the repository path (the reference
// locator with the "<hostname>/" prefix removed), the registry hosts to try,
// and the base header copied into each request.
type dockerBase struct {
        refspec    reference.Spec
        repository string
        hosts      []RegistryHost
        header     http.Header
}

// base looks up the registry hosts for refspec's hostname and assembles the
// dockerBase used for requests against that repository. The repository is
// the locator with its "<hostname>/" prefix removed.
func (r *dockerResolver) base(refspec reference.Spec) (*dockerBase, error) {
        hostname := refspec.Hostname()

        registryHosts, err := r.hosts(hostname)
        if err != nil {
                return nil, err
        }

        db := &dockerBase{
                refspec: refspec,
                hosts:   registryHosts,
                header:  r.header,
        }
        db.repository = strings.TrimPrefix(refspec.Locator, hostname+"/")
        return db, nil
}

// filterHosts returns, in their configured order, the hosts whose
// capabilities include all of caps. The result is nil when none match.
func (r *dockerBase) filterHosts(caps HostCapabilities) (hosts []RegistryHost) {
        for i := range r.hosts {
                if candidate := r.hosts[i]; candidate.Capabilities.Has(caps) {
                        hosts = append(hosts, candidate)
                }
        }
        return hosts
}

// request creates a request for host using the given HTTP method. The URL
// path is "/" + host.Path + repository + ps joined and cleaned by path.Join;
// a trailing "/" on the last element is preserved. The header is a copy of
// the base header with the host's own header values appended.
func (r *dockerBase) request(host RegistryHost, method string, ps ...string) *request {
        hdr := r.header.Clone()
        if hdr == nil {
                hdr = http.Header{}
        }
        for name, values := range host.Header {
                hdr[name] = append(hdr[name], values...)
        }

        segments := make([]string, 0, 3+len(ps))
        segments = append(segments, "/", host.Path, r.repository)
        segments = append(segments, ps...)

        joined := path.Join(segments...)
        // Join strips trailing slash, re-add ending "/" if included
        if last := segments[len(segments)-1]; strings.HasSuffix(last, "/") {
                joined += "/"
        }

        return &request{
                method: method,
                path:   joined,
                header: hdr,
                host:   host,
        }
}

// authorize lets the host's authorizer, when one is configured, add
// authorization to req. Without an authorizer the request is left untouched
// and nil is returned.
func (r *request) authorize(ctx context.Context, req *http.Request) error {
        authorizer := r.host.Authorizer
        if authorizer == nil {
                return nil
        }
        return authorizer.Authorize(ctx, req)
}

// addNamespace appends an "ns=<ns>" query parameter to the request path when
// the request's host acts as a proxy for ns; otherwise the path is left
// untouched.
//
// Query parameters already present in the path are preserved: the existing
// query is parsed, "ns" is added, and the whole query is re-encoded (which
// sorts parameters by key). If the existing query cannot be parsed the error
// is returned and the path is not modified.
func (r *request) addNamespace(ns string) (err error) {
        if !r.host.isProxy(ns) {
                return nil
        }

        var q url.Values
        if i := strings.IndexByte(r.path, '?'); i > 0 {
                // Parse the existing query BEFORE truncating the path; slicing the
                // already-truncated path would always yield an empty query and
                // silently drop the parameters that were there.
                q, err = url.ParseQuery(r.path[i+1:])
                if err != nil {
                        return err
                }
                r.path = r.path[:i+1]
        } else {
                r.path += "?"
                q = url.Values{}
        }
        q.Add("ns", ns)

        r.path += q.Encode()

        return nil
}

// request describes a single registry HTTP request.
//
// method and path are combined with host's scheme and host name to form the
// URL. header is cloned into every outgoing HTTP request. body, when set, is
// called to obtain the request body and is also installed as the HTTP
// request's GetBody; size is used as the content length when greater than
// zero.
type request struct {
        method string
        path   string
        header http.Header
        host   RegistryHost
        body   func() (io.ReadCloser, error)
        size   int64
}

// do sends the request once and returns the response.
//
// The URL is built from the host's scheme and name plus the request path.
// Headers are cloned into the HTTP request so concurrent use of the request
// does not race on the header map. When a body function is set it is called
// to open the body and is installed as GetBody so the body can be re-opened
// (for example on redirects).
//
// The request is authorized before being sent; redirects are re-authorized
// and limited to 10 unless the host's client supplies its own CheckRedirect.
// If authorization fails the opened body is closed here, because the request
// never reaches the HTTP client that would otherwise close it.
//
// The caller is responsible for closing the returned response's body.
func (r *request) do(ctx context.Context) (*http.Response, error) {
        u := r.host.Scheme + "://" + r.host.Host + r.path
        req, err := http.NewRequestWithContext(ctx, r.method, u, nil)
        if err != nil {
                return nil, err
        }
        if r.header == nil {
                req.Header = http.Header{}
        } else {
                req.Header = r.header.Clone() // headers need to be copied to avoid concurrent map access
        }
        if r.body != nil {
                body, err := r.body()
                if err != nil {
                        return nil, err
                }
                req.Body = body
                req.GetBody = r.body
                if r.size > 0 {
                        req.ContentLength = r.size
                }
        }

        ctx = log.WithLogger(ctx, log.G(ctx).WithField("url", u))
        log.G(ctx).WithFields(requestFields(req)).Debug("do request")
        if err := r.authorize(ctx, req); err != nil {
                // Nothing else will close the body opened above: the request is
                // never handed to the HTTP client.
                if req.Body != nil {
                        req.Body.Close()
                }
                return nil, fmt.Errorf("failed to authorize: %w", err)
        }

        // Copy the host's client (if any) so installing CheckRedirect and
        // tracing below does not modify the shared client.
        client := &http.Client{}
        if r.host.Client != nil {
                *client = *r.host.Client
        }
        if client.CheckRedirect == nil {
                client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
                        if len(via) >= 10 {
                                return errors.New("stopped after 10 redirects")
                        }
                        if err := r.authorize(ctx, req); err != nil {
                                return fmt.Errorf("failed to authorize redirect: %w", err)
                        }
                        return nil
                }
        }

        tracing.UpdateHTTPClient(client, tracing.Name("remotes.docker.resolver", "HTTPRequest"))

        resp, err := client.Do(req)
        if err != nil {
                return nil, fmt.Errorf("failed to do request: %w", err)
        }
        log.G(ctx).WithFields(responseFields(resp)).Debug("fetch response received")
        return resp, nil
}

// doWithRetries sends the request and keeps re-sending it for as long as
// retryRequest asks for a retry. Every response received is appended to
// responses, so the retry decision sees the full history. Bodies of
// responses that are not returned are closed. The caller must close the body
// of the returned response.
func (r *request) doWithRetries(ctx context.Context, responses []*http.Response) (*http.Response, error) {
        for {
                resp, err := r.do(ctx)
                if err != nil {
                        return nil, err
                }

                responses = append(responses, resp)
                again, err := r.retryRequest(ctx, responses)
                if err != nil {
                        resp.Body.Close()
                        return nil, err
                }
                if !again {
                        return resp, nil
                }
                resp.Body.Close()
        }
}

// retryRequest decides, from the most recent response in responses, whether
// the request should be sent again. It never retries once more than five
// responses have been collected.
//
//   - 401: retried when the host's authorizer accepts the responses; an
//     authorizer error other than "not implemented" is returned.
//   - 405: a HEAD on a manifests path is switched to GET and retried.
//   - 408 and 429: always retried.
//
// Any other status is not retried.
func (r *request) retryRequest(ctx context.Context, responses []*http.Response) (bool, error) {
        if len(responses) > 5 {
                return false, nil
        }

        last := responses[len(responses)-1]
        switch last.StatusCode {
        case http.StatusRequestTimeout, http.StatusTooManyRequests:
                return true, nil

        case http.StatusMethodNotAllowed:
                // Support registries which have not properly implemented the HEAD method for
                // manifests endpoint
                if r.method == http.MethodHead && strings.Contains(r.path, "/manifests/") {
                        r.method = http.MethodGet
                        return true, nil
                }
                return false, nil

        case http.StatusUnauthorized:
                log.G(ctx).WithField("header", last.Header.Get("WWW-Authenticate")).Debug("Unauthorized")
                if r.host.Authorizer == nil {
                        return false, nil
                }
                err := r.host.Authorizer.AddResponses(ctx, responses)
                switch {
                case err == nil:
                        return true, nil
                case errdefs.IsNotImplemented(err):
                        return false, nil
                default:
                        return false, err
                }

        default:
                // TODO: Handle 50x errors accounting for attempt history
                return false, nil
        }
}

// String returns the full URL of the request: scheme, host and path.
func (r *request) String() string {
        return fmt.Sprintf("%s://%s%s", r.host.Scheme, r.host.Host, r.path)
}

// requestFields builds log fields describing req: its method and every
// header value except "Authorization". Header names are lower-cased; the
// first value of a header is stored under "request.header.<name>" and each
// further value under "request.header.<name>.<index>".
func requestFields(req *http.Request) log.Fields {
        fields := map[string]interface{}{
                "request.method": req.Method,
        }

        for name, values := range req.Header {
                lower := strings.ToLower(name)
                if lower == "authorization" {
                        // never log credentials
                        continue
                }

                key := "request.header." + lower
                for idx, value := range values {
                        if idx == 0 {
                                fields[key] = value
                                continue
                        }
                        fields[fmt.Sprintf("%s.%d", key, idx)] = value
                }
        }

        return fields
}

// responseFields builds log fields describing resp: its status line and
// every header value. Header names are lower-cased; the first value of a
// header is stored under "response.header.<name>" and each further value
// under "response.header.<name>.<index>".
func responseFields(resp *http.Response) log.Fields {
        fields := map[string]interface{}{
                "response.status": resp.Status,
        }

        for name, values := range resp.Header {
                key := "response.header." + strings.ToLower(name)
                for idx, value := range values {
                        if idx == 0 {
                                fields[key] = value
                                continue
                        }
                        fields[fmt.Sprintf("%s.%d", key, idx)] = value
                }
        }

        return fields
}

// IsLocalhost checks if the registry host is local. The host may carry a
// port. It is local when the name is exactly "localhost" or when it parses
// as a loopback IP address.
func IsLocalhost(host string) bool {
        name := host
        if h, _, err := net.SplitHostPort(host); err == nil {
                name = h
        }

        // ParseIP yields a nil IP for non-addresses, for which IsLoopback is false.
        return name == "localhost" || net.ParseIP(name).IsLoopback()
}

// NewHTTPFallback wraps transport in an http.RoundTripper that can fall back
// from https to http. It is meant for registry endpoints that may be
// configured for either plain http or TLS, such as defaulted localhost
// endpoints.
func NewHTTPFallback(transport http.RoundTripper) http.RoundTripper {
        fallback := new(httpFallback)
        fallback.super = transport
        return fallback
}

// httpFallback is the http.RoundTripper returned by NewHTTPFallback.
//
// super is the wrapped transport that performs the actual round trips. host
// records the URL host of the most recent request that fell back to plain
// http; later requests to that same host skip the https attempt.
type httpFallback struct {
        super http.RoundTripper
        host  string
}

// RoundTrip sends r through the wrapped transport and, if that fails with a
// TLS error (see isTLSError), retries the same request over plain http.
//
// Once a host has fallen back it is remembered in f.host, and subsequent
// requests to that host go straight to http without trying the original
// request first. Only one host is remembered at a time.
//
// NOTE(review): f.host is read and written here without any synchronization
// visible in this type, while http.RoundTripper implementations are expected
// to be safe for concurrent use — confirm whether callers can share one
// instance across goroutines.
func (f *httpFallback) RoundTrip(r *http.Request) (*http.Response, error) {
        // only fall back if the same host had previously fell back
        if f.host != r.URL.Host {
                resp, err := f.super.RoundTrip(r)
                if !isTLSError(err) {
                        return resp, err
                }
        }

        // Build a shallow copy of the request that targets the http scheme; the
        // caller's request and URL are left unmodified.
        plainHTTPUrl := *r.URL
        plainHTTPUrl.Scheme = "http"

        plainHTTPRequest := *r
        plainHTTPRequest.URL = &plainHTTPUrl

        // This branch is reached only when the first attempt above was made and
        // failed with a TLS error, so remember the host for next time.
        if f.host != r.URL.Host {
                f.host = r.URL.Host

                // update body on the second attempt
                if r.Body != nil && r.GetBody != nil {
                        body, err := r.GetBody()
                        if err != nil {
                                return nil, err
                        }
                        plainHTTPRequest.Body = body
                }
        }

        return f.super.RoundTrip(&plainHTTPRequest)
}

// isTLSError reports whether err indicates that a TLS connection was
// attempted against an endpoint that does not speak TLS. That is the case
// when err wraps a tls.RecordHeaderError whose record header is "HTTP/" (the
// peer answered in plain HTTP), or when the error text mentions
// "TLS handshake timeout". A nil error is never a TLS error.
func isTLSError(err error) bool {
        if err == nil {
                return false
        }

        var headerErr tls.RecordHeaderError
        switch {
        case errors.As(err, &headerErr) && string(headerErr.RecordHeader[:]) == "HTTP/":
                return true
        case strings.Contains(err.Error(), "TLS handshake timeout"):
                return true
        default:
                return false
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package schema1 provides a converter to fetch an image formatted in Docker Image Manifest v2, Schema 1.
//
// Deprecated: use images formatted in Docker Image Manifest v2, Schema 2, or OCI Image Spec v1.
package schema1

import (
        "bytes"
        "context"
        "encoding/base64"
        "encoding/json"
        "errors"
        "fmt"
        "io"
        "os"
        "strconv"
        "strings"
        "sync"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/containerd/v2/pkg/archive/compression"
        "github.com/containerd/containerd/v2/pkg/deprecation"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        digest "github.com/opencontainers/go-digest"
        specs "github.com/opencontainers/image-spec/specs-go"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "golang.org/x/sync/errgroup"
)

const (
        // manifestSizeLimit is 8e6 bytes (8MB).
        // NOTE(review): presumably the upper bound applied when reading a
        // schema 1 manifest — its use is outside this view, confirm.
        manifestSizeLimit            = 8e6 // 8MB
        // labelDockerSchema1EmptyLayer is a containerd label key.
        // NOTE(review): judging by its name it marks empty schema 1 layers —
        // confirm against the code that sets and reads it.
        labelDockerSchema1EmptyLayer = "containerd.io/docker.schema1.empty-layer"
)

// blobState is the per-layer bookkeeping the Converter stores in its blobMap,
// keyed by the layer's blob digest.
type blobState struct {
        // diffID is a digest associated with the blob.
        // NOTE(review): presumably the digest of the uncompressed layer, filled
        // in when the blob is fetched — confirm.
        diffID digest.Digest
        // empty is set by Handle from isEmptyLayer on the layer's history
        // entry; blobs marked empty are not scheduled for download.
        empty  bool
}

// Converter converts schema1 manifests to schema2 on fetch
type Converter struct {
        // contentStore and fetcher are the store and fetcher supplied to
        // NewConverter.
        contentStore content.Store
        fetcher      remotes.Fetcher

        // pulledManifest is the schema 1 manifest being converted; Handle reads
        // it after fetchManifest succeeds and requires it to be set before
        // handling layer blobs.
        pulledManifest *manifest

        // NOTE(review): mu presumably guards blobMap and layerBlobs — confirm
        // against fetchBlob; Handle accesses blobMap without taking it.
        mu         sync.Mutex
        // blobMap records the state of each layer blob, keyed by blob digest.
        blobMap    map[digest.Digest]blobState
        // layerBlobs maps a layer's diff ID to its descriptor; Convert uses it
        // to assemble the layer list of the converted manifest.
        layerBlobs map[digest.Digest]ocispec.Descriptor
}

// ErrDisabled is returned by NewConverter when pulling schema 1 images has
// not been enabled through the deprecation.EnvPullSchema1Image environment
// variable. NewConverter also logs it as a warning when the variable does
// enable schema 1 pulls.
var ErrDisabled = fmt.Errorf("Pulling Schema 1 images have been deprecated and disabled by default since containerd v2.0. "+
        "As a workaround you may set an environment variable `%s=1`, but this will be completely removed in containerd v2.1.",
        deprecation.EnvPullSchema1Image)

// NewConverter returns a new converter backed by contentStore and fetcher.
//
// Schema 1 support is opt-in: the deprecation.EnvPullSchema1Image environment
// variable must be set to a true boolean value. ErrDisabled is returned when
// the variable is unset, empty or false, and a parse error when its value is
// not a boolean. When enabled, the deprecation notice is logged as a warning.
func NewConverter(contentStore content.Store, fetcher remotes.Fetcher) (*Converter, error) {
        raw := os.Getenv(deprecation.EnvPullSchema1Image)
        if raw == "" {
                return nil, ErrDisabled
        }

        enabled, err := strconv.ParseBool(raw)
        switch {
        case err != nil:
                return nil, fmt.Errorf("failed to parse `%s=%s`: %w", deprecation.EnvPullSchema1Image, raw, err)
        case !enabled:
                return nil, ErrDisabled
        }

        log.L.Warn(ErrDisabled)

        conv := &Converter{
                contentStore: contentStore,
                fetcher:      fetcher,
                blobMap:      map[digest.Digest]blobState{},
                layerBlobs:   map[digest.Digest]ocispec.Descriptor{},
        }
        return conv, nil
}

// Handle fetches the content for desc and returns the descriptors to fetch
// next.
//
// For a schema 1 manifest, the manifest is fetched and one gzip layer
// descriptor (size -1) is returned for every layer blob not seen before and
// not known to be empty. The descriptors are returned in reverse manifest
// order. For a gzip layer descriptor the blob is fetched, which requires the
// manifest to have been handled first. Any other media type is an error.
func (c *Converter) Handle(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
        switch desc.MediaType {
        case images.MediaTypeDockerSchema2LayerGzip:
                if c.pulledManifest == nil {
                        return nil, errors.New("manifest required for schema 1 blob pull")
                }
                return nil, c.fetchBlob(ctx, desc)

        case images.MediaTypeDockerSchema1Manifest:
                if err := c.fetchManifest(ctx, desc); err != nil {
                        return nil, err
                }

                m := c.pulledManifest
                if len(m.FSLayers) != len(m.History) {
                        return nil, errors.New("invalid schema 1 manifest, history and layer mismatch")
                }

                children := make([]ocispec.Descriptor, 0, len(m.FSLayers))
                for i, layer := range m.FSLayers {
                        sum := layer.BlobSum
                        if _, seen := c.blobMap[sum]; seen {
                                continue
                        }

                        empty, err := isEmptyLayer([]byte(m.History[i].V1Compatibility))
                        if err != nil {
                                return nil, err
                        }

                        // Do not attempt to download a known empty blob
                        if !empty {
                                next := ocispec.Descriptor{
                                        MediaType: images.MediaTypeDockerSchema2LayerGzip,
                                        Digest:    sum,
                                        Size:      -1,
                                }
                                // prepend, so the result is in reverse manifest order
                                children = append([]ocispec.Descriptor{next}, children...)
                        }
                        c.blobMap[sum] = blobState{empty: empty}
                }
                return children, nil
        }

        return nil, fmt.Errorf("%v not support for schema 1 manifests", desc.MediaType)
}

// ConvertOptions provides options on converting a docker schema1 manifest.
// Convert starts from the OCI image manifest and image config media types and
// lets each ConvertOpt override them.
type ConvertOptions struct {
        // ManifestMediaType specifies the media type of the manifest OCI descriptor.
        ManifestMediaType string

        // ConfigMediaType specifies the media type of the manifest config OCI
        // descriptor.
        ConfigMediaType string
}

// ConvertOpt allows configuring a convert operation. Each option receives the
// ConvertOptions to modify; a non-nil error aborts the conversion.
type ConvertOpt func(context.Context, *ConvertOptions) error

// UseDockerSchema2 returns a ConvertOpt that makes the conversion emit the
// Docker schema2 manifest and config media types instead of the defaults.
// The returned option never fails.
func UseDockerSchema2() ConvertOpt {
        useDockerTypes := func(_ context.Context, opts *ConvertOptions) error {
                opts.ConfigMediaType = images.MediaTypeDockerSchema2Config
                opts.ManifestMediaType = images.MediaTypeDockerSchema2Manifest
                return nil
        }
        return useDockerTypes
}

// Convert builds an image config and manifest from the previously pulled
// docker schema 1 manifest, writes both to the content store and returns the
// descriptor of the new manifest.
//
// The manifest and config media types default to the OCI types and may be
// overridden through opts (see UseDockerSchema2). An error is returned when
// an option fails, when no usable schema 1 manifest has been pulled, when the
// image config embedded in the first history entry cannot be decoded, or when
// marshalling or writing either blob fails.
func (c *Converter) Convert(ctx context.Context, opts ...ConvertOpt) (ocispec.Descriptor, error) {
        co := ConvertOptions{
                ManifestMediaType: ocispec.MediaTypeImageManifest,
                ConfigMediaType:   ocispec.MediaTypeImageConfig,
        }
        for _, opt := range opts {
                if err := opt(ctx, &co); err != nil {
                        return ocispec.Descriptor{}, err
                }
        }

        // schema1ManifestHistory also validates that a manifest with at least
        // one history entry is present, which makes History[0] below safe.
        history, diffIDs, err := c.schema1ManifestHistory()
        if err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("schema 1 conversion failed: %w", err)
        }

        // The first history entry carries the v1 image JSON used as the base
        // of the generated config.
        var img ocispec.Image
        if err := json.Unmarshal([]byte(c.pulledManifest.History[0].V1Compatibility), &img); err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("failed to unmarshal image from schema 1 history: %w", err)
        }

        img.History = history
        img.RootFS = ocispec.RootFS{
                Type:    "layers",
                DiffIDs: diffIDs,
        }

        b, err := json.MarshalIndent(img, "", "   ")
        if err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("failed to marshal image: %w", err)
        }

        config := ocispec.Descriptor{
                MediaType: co.ConfigMediaType,
                Digest:    digest.Canonical.FromBytes(b),
                Size:      int64(len(b)),
        }

        // Layer descriptors were recorded per diff ID while the blobs were fetched.
        layers := make([]ocispec.Descriptor, len(diffIDs))
        for i, diffID := range diffIDs {
                layers[i] = c.layerBlobs[diffID]
        }

        manifest := ocispec.Manifest{
                Versioned: specs.Versioned{
                        SchemaVersion: 2,
                },
                Config: config,
                Layers: layers,
        }

        mb, err := json.MarshalIndent(manifest, "", "   ")
        if err != nil {
                // Report the manifest (not the image config) as the failing document.
                return ocispec.Descriptor{}, fmt.Errorf("failed to marshal manifest: %w", err)
        }

        desc := ocispec.Descriptor{
                MediaType: co.ManifestMediaType,
                Digest:    digest.Canonical.FromBytes(mb),
                Size:      int64(len(mb)),
        }

        // Label the manifest with references to its config (index 0) and to
        // every layer (index 1..n).
        labels := make(map[string]string, len(manifest.Layers)+1)
        labels["containerd.io/gc.ref.content.0"] = manifest.Config.Digest.String()
        for i, ch := range manifest.Layers {
                labels[fmt.Sprintf("containerd.io/gc.ref.content.%d", i+1)] = ch.Digest.String()
        }

        ref := remotes.MakeRefKey(ctx, desc)
        if err := content.WriteBlob(ctx, c.contentStore, ref, bytes.NewReader(mb), desc, content.WithLabels(labels)); err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("failed to write image manifest: %w", err)
        }

        ref = remotes.MakeRefKey(ctx, config)
        if err := content.WriteBlob(ctx, c.contentStore, ref, bytes.NewReader(b), config); err != nil {
                return ocispec.Descriptor{}, fmt.Errorf("failed to write image config: %w", err)
        }

        return desc, nil
}

// ReadStripSignature consumes a schema1 manifest from schema1Blob and returns
// its bytes with the "signatures" field removed. At most manifestSizeLimit
// bytes are read from the reader; read errors and signature-stripping errors
// are returned unchanged.
func ReadStripSignature(schema1Blob io.Reader) ([]byte, error) {
        bounded := io.LimitReader(schema1Blob, manifestSizeLimit) // limit to 8MB
        raw, err := io.ReadAll(bounded)
        if err != nil {
                return nil, err
        }
        return stripSignature(raw)
}

// fetchManifest downloads the schema 1 manifest described by desc, strips its
// signature block, decodes it and stores the result in c.pulledManifest.
// Documents that carry "layers" or "manifests" keys are rejected because they
// are not schema 1 manifests.
func (c *Converter) fetchManifest(ctx context.Context, desc ocispec.Descriptor) error {
        log.G(ctx).Debug("fetch schema 1")

        body, err := c.fetcher.Fetch(ctx, desc)
        if err != nil {
                return err
        }

        // The body is closed as soon as it has been consumed, before the read
        // result is inspected.
        stripped, err := ReadStripSignature(body)
        body.Close()
        if err != nil {
                return err
        }

        parsed := new(manifest)
        if err := json.Unmarshal(stripped, parsed); err != nil {
                return err
        }

        hasOCIKeys := len(parsed.Manifests) != 0 || len(parsed.Layers) != 0
        if hasOCIKeys {
                return errors.New("converter: expected schema1 document but found extra keys")
        }

        c.pulledManifest = parsed
        return nil
}

// fetchBlob makes sure the layer blob described by desc is present in the
// content store and records its uncompressed digest (diff ID) and emptiness.
//
// When the blob is already in the store, previously stored labels are reused
// if possible; otherwise the stored blob is decompressed to compute the
// state. When the blob is missing it is fetched, and the stream is written to
// the store and decompressed concurrently through a pipe. The computed state
// is saved as labels on the blob and cached in c.blobMap / c.layerBlobs.
func (c *Converter) fetchBlob(ctx context.Context, desc ocispec.Descriptor) error {
        log.G(ctx).Debug("fetch blob")

        var (
                ref            = remotes.MakeRefKey(ctx, desc)
                calc           = newBlobStateCalculator()
                compressMethod = compression.Gzip
        )

        // size may be unknown, set to zero for content ingest
        ingestDesc := desc
        if ingestDesc.Size == -1 {
                ingestDesc.Size = 0
        }

        cw, err := content.OpenWriter(ctx, c.contentStore, content.WithRef(ref), content.WithDescriptor(ingestDesc))
        if err != nil {
                if !errdefs.IsAlreadyExists(err) {
                        return err
                }

                // The blob already exists: try the labels first so the blob
                // does not have to be decompressed again.
                reuse, err := c.reuseLabelBlobState(ctx, desc)
                if err != nil {
                        return err
                }

                if reuse {
                        return nil
                }

                ra, err := c.contentStore.ReaderAt(ctx, desc)
                if err != nil {
                        return err
                }
                defer ra.Close()

                r, err := compression.DecompressStream(content.NewReader(ra))
                if err != nil {
                        return err
                }

                compressMethod = r.GetCompression()
                _, err = io.Copy(calc, r)
                r.Close()
                if err != nil {
                        return err
                }
        } else {
                defer cw.Close()

                rc, err := c.fetcher.Fetch(ctx, desc)
                if err != nil {
                        return err
                }
                defer rc.Close()

                eg, _ := errgroup.WithContext(ctx)
                pr, pw := io.Pipe()

                // Consumer: decompress the teed stream to compute the blob state.
                eg.Go(func() error {
                        r, err := compression.DecompressStream(pr)
                        if err != nil {
                                // Close the read side so the copy goroutine's
                                // writes into the pipe fail instead of blocking
                                // forever with nobody reading.
                                pr.CloseWithError(err)
                                return err
                        }

                        compressMethod = r.GetCompression()
                        _, err = io.Copy(calc, r)
                        r.Close()
                        pr.CloseWithError(err)
                        return err
                })

                // Producer: write the fetched stream to the content store while
                // teeing it into the pipe.
                eg.Go(func() error {
                        defer pw.Close()

                        return content.Copy(ctx, cw, io.TeeReader(rc, pw), ingestDesc.Size, ingestDesc.Digest)
                })

                if err := eg.Wait(); err != nil {
                        return err
                }
        }

        // Resolve the real size now that the blob is in the store.
        if desc.Size == -1 {
                info, err := c.contentStore.Info(ctx, desc.Digest)
                if err != nil {
                        return fmt.Errorf("failed to get blob info: %w", err)
                }
                desc.Size = info.Size
        }

        if compressMethod == compression.Uncompressed {
                log.G(ctx).WithField("id", desc.Digest).Debugf("changed media type for uncompressed schema1 layer blob")
                desc.MediaType = images.MediaTypeDockerSchema2Layer
        }

        state := calc.State()

        cinfo := content.Info{
                Digest: desc.Digest,
                Labels: map[string]string{
                        labels.LabelUncompressed:     state.diffID.String(),
                        labelDockerSchema1EmptyLayer: strconv.FormatBool(state.empty),
                },
        }

        // Persist the state as labels so later pulls can reuse it.
        if _, err := c.contentStore.Update(ctx, cinfo, "labels."+labels.LabelUncompressed, fmt.Sprintf("labels.%s", labelDockerSchema1EmptyLayer)); err != nil {
                return fmt.Errorf("failed to update uncompressed label: %w", err)
        }

        c.mu.Lock()
        c.blobMap[desc.Digest] = state
        c.layerBlobs[state.diffID] = desc
        c.mu.Unlock()

        return nil
}

// reuseLabelBlobState tries to restore the state of an already stored blob
// from its content store labels instead of decompressing it again.
//
// It returns true when both the uncompressed-digest label and the
// empty-layer label are present and parse correctly; the state is then
// recorded in c.blobMap and c.layerBlobs. It returns false, with a nil error,
// when a label is missing or malformed so the caller can recompute the state.
// An error is returned only when the blob info cannot be read.
func (c *Converter) reuseLabelBlobState(ctx context.Context, desc ocispec.Descriptor) (bool, error) {
        cinfo, err := c.contentStore.Info(ctx, desc.Digest)
        if err != nil {
                return false, fmt.Errorf("failed to get blob info: %w", err)
        }
        // desc is a local copy; the size from the store replaces whatever
        // (possibly unknown) size the caller had.
        desc.Size = cinfo.Size

        diffID, ok := cinfo.Labels[labels.LabelUncompressed]
        if !ok {
                return false, nil
        }

        emptyVal, ok := cinfo.Labels[labelDockerSchema1EmptyLayer]
        if !ok {
                return false, nil
        }

        isEmpty, err := strconv.ParseBool(emptyVal)
        if err != nil {
                // Log the parse error (it includes the offending value) rather
                // than the always-false result.
                log.G(ctx).WithField("id", desc.Digest).Warnf("failed to parse bool from label %s: %v", labelDockerSchema1EmptyLayer, err)
                return false, nil
        }

        bState := blobState{empty: isEmpty}

        if bState.diffID, err = digest.Parse(diffID); err != nil {
                log.G(ctx).WithField("id", desc.Digest).Warnf("failed to parse digest from label %s: %v", labels.LabelUncompressed, diffID)
                return false, nil
        }

        // NOTE: there is no need to read header to get compression method
        // because there are only two kinds of methods.
        if bState.diffID == desc.Digest {
                desc.MediaType = images.MediaTypeDockerSchema2Layer
        } else {
                desc.MediaType = images.MediaTypeDockerSchema2LayerGzip
        }

        c.mu.Lock()
        c.blobMap[desc.Digest] = bState
        c.layerBlobs[bState.diffID] = desc
        c.mu.Unlock()
        return true, nil
}

// schema1ManifestHistory converts the history of the pulled schema 1 manifest
// into OCI history entries and the list of diff IDs of the non-empty layers.
//
// Both results are in the reverse of the manifest's order. An error is
// returned when no manifest has been pulled, when it has no history, when
// the history and fsLayers arrays differ in length, or when a history entry
// cannot be decoded.
func (c *Converter) schema1ManifestHistory() ([]ocispec.History, []digest.Digest, error) {
        if c.pulledManifest == nil {
                return nil, nil, errors.New("missing schema 1 manifest for conversion")
        }
        m := *c.pulledManifest

        if len(m.History) == 0 {
                return nil, nil, errors.New("no history")
        }
        // Every history entry is paired with the fsLayer at the same index;
        // reject mismatched manifests instead of panicking on the index below.
        if len(m.FSLayers) != len(m.History) {
                return nil, nil, errors.New("invalid schema 1 manifest, history and layer mismatch")
        }

        history := make([]ocispec.History, len(m.History))
        // Kept non-nil so that an image without layers still marshals an
        // empty JSON array.
        diffIDs := make([]digest.Digest, 0, len(m.History))
        for i := range m.History {
                var h v1History
                if err := json.Unmarshal([]byte(m.History[i].V1Compatibility), &h); err != nil {
                        return nil, nil, fmt.Errorf("failed to unmarshal history: %w", err)
                }

                state := c.blobMap[m.FSLayers[i].BlobSum]

                history[len(history)-i-1] = ocispec.History{
                        Author:     h.Author,
                        Comment:    h.Comment,
                        Created:    &h.Created,
                        CreatedBy:  strings.Join(h.ContainerConfig.Cmd, " "),
                        EmptyLayer: state.empty,
                }

                if !state.empty {
                        diffIDs = append(diffIDs, state.diffID)
                }
        }

        // Collected in manifest order; flip once to get the reversed order.
        for lo, hi := 0, len(diffIDs)-1; lo < hi; lo, hi = lo+1, hi-1 {
                diffIDs[lo], diffIDs[hi] = diffIDs[hi], diffIDs[lo]
        }

        return history, diffIDs, nil
}

// fsLayer is one element of the "fsLayers" array of a schema 1 manifest.
type fsLayer struct {
        // BlobSum is the digest of the layer blob.
        BlobSum digest.Digest `json:"blobSum"`
}

// history is one element of the "history" array of a schema 1 manifest.
type history struct {
        // V1Compatibility is a JSON document stored as a string; it is decoded
        // separately (see v1History).
        V1Compatibility string `json:"v1Compatibility"`
}

// manifest is the subset of a schema 1 manifest document that the converter
// reads. Layers and Manifests do not belong to schema 1; they are captured
// only so that documents of other kinds can be detected and rejected.
type manifest struct {
        FSLayers  []fsLayer       `json:"fsLayers"`
        History   []history       `json:"history"`
        Layers    json.RawMessage `json:"layers,omitempty"`    // OCI manifest
        Manifests json.RawMessage `json:"manifests,omitempty"` // OCI index
}

// v1History is the subset of a decoded v1Compatibility document that is used
// to build OCI history entries and to detect empty layers. ThrowAway and Size
// are pointers so that an absent field can be told apart from a zero value.
type v1History struct {
        Author          string    `json:"author,omitempty"`
        Created         time.Time `json:"created"`
        Comment         string    `json:"comment,omitempty"`
        ThrowAway       *bool     `json:"throwaway,omitempty"`
        Size            *int      `json:"Size,omitempty"` // used before ThrowAway field
        ContainerConfig struct {
                Cmd []string `json:"Cmd,omitempty"`
        } `json:"container_config,omitempty"`
}

// isEmptyLayer inspects a v1 compatibility history document and reports
// whether it marks its layer as empty.
//
// The "throwaway" field wins when present; otherwise a "Size" of zero means
// empty. When neither field is present the result is false, which only means
// "unknown" - it does not prove that the layer has content. An error is
// returned when the document is not valid JSON for v1History.
func isEmptyLayer(compatHistory []byte) (bool, error) {
        var entry v1History
        if err := json.Unmarshal(compatHistory, &entry); err != nil {
                return false, err
        }

        switch {
        case entry.ThrowAway != nil:
                return *entry.ThrowAway, nil
        case entry.Size != nil:
                return *entry.Size == 0, nil
        default:
                // Emptiness cannot be derived from the history alone.
                return false, nil
        }
}

// signature captures the "signatures" array of a signed schema 1 manifest.
type signature struct {
        Signatures []jsParsedSignature `json:"signatures"`
}

// jsParsedSignature is one entry of the "signatures" array. Only the
// "protected" value is read; it is base64url text that decodes to a
// protectedBlock document.
type jsParsedSignature struct {
        Protected string `json:"protected"`
}

// protectedBlock is the decoded "protected" value of a signature. It tells
// how to rebuild the unsigned document: keep the first Length bytes of the
// signed document and append the decoded Tail.
type protectedBlock struct {
        Length int    `json:"formatLength"`
        Tail   string `json:"formatTail"` // base64url encoded
}

// joseBase64UrlDecode decodes base64url text whose trailing '=' padding may
// have been dropped, as the jose specification allows. The padding is
// restored before handing the string to the standard URL-safe decoder.
// http://tools.ietf.org/html/draft-ietf-jose-json-web-signature-31#section-2
//
// A length that leaves a remainder of one when divided by four cannot be
// valid base64 and is rejected.
func joseBase64UrlDecode(s string) ([]byte, error) {
        rem := len(s) % 4
        if rem == 1 {
                return nil, errors.New("illegal base64url string")
        }
        if rem != 0 {
                // rem == 2 needs "==", rem == 3 needs "=".
                s += "=="[:4-rem]
        }
        return base64.URLEncoding.DecodeString(s)
}

// stripSignature removes the signature block from a signed schema 1 manifest.
//
// The first signature's "protected" value describes the unsigned document:
// the first formatLength bytes of b followed by the decoded formatTail. The
// result is returned in a newly allocated slice; b is left untouched.
//
// An error is returned when b is not valid JSON, carries no signatures, when
// the protected block or its tail cannot be decoded, or when formatLength is
// negative or larger than len(b).
func stripSignature(b []byte) ([]byte, error) {
        var sig signature
        if err := json.Unmarshal(b, &sig); err != nil {
                return nil, err
        }
        if len(sig.Signatures) == 0 {
                return nil, errors.New("no signatures")
        }
        pb, err := joseBase64UrlDecode(sig.Signatures[0].Protected)
        if err != nil {
                return nil, fmt.Errorf("could not decode %s: %w", sig.Signatures[0].Protected, err)
        }

        var protected protectedBlock
        if err := json.Unmarshal(pb, &protected); err != nil {
                return nil, err
        }

        // The length comes from the (untrusted) document itself; a negative
        // value would otherwise panic when slicing b.
        if protected.Length < 0 || protected.Length > len(b) {
                return nil, errors.New("invalid protected length block")
        }

        tail, err := joseBase64UrlDecode(protected.Tail)
        if err != nil {
                return nil, fmt.Errorf("invalid tail base 64 value: %w", err)
        }

        // Build the result in a fresh buffer so the caller's slice is not
        // overwritten through a shared backing array.
        stripped := make([]byte, 0, protected.Length+len(tail))
        stripped = append(stripped, b[:protected.Length]...)
        return append(stripped, tail...), nil
}

// blobStateCalculator is a write sink that derives a blobState from the
// bytes written to it.
type blobStateCalculator struct {
        // empty stays true until a byte other than 0x00 has been written.
        empty bool
        // digester accumulates the digest of everything written.
        digester digest.Digester
}

// newBlobStateCalculator returns a calculator that has seen no data yet: it
// reports empty and uses a digester for the canonical digest algorithm.
func newBlobStateCalculator() *blobStateCalculator {
        calc := new(blobStateCalculator)
        calc.empty = true
        calc.digester = digest.Canonical.Digester()
        return calc
}

// Write feeds p into the digest and clears the empty flag as soon as p
// contains a byte other than 0x00. It returns the result of writing p to the
// underlying hash.
func (c *blobStateCalculator) Write(p []byte) (int, error) {
        // Once content has been seen there is nothing left to scan for.
        for i := 0; c.empty && i < len(p); i++ {
                if p[i] != 0x00 {
                        c.empty = false
                }
        }
        return c.digester.Hash().Write(p)
}

// State returns the blobState for the data written so far: the empty flag
// and the digest of all written bytes as the diff ID.
func (c *blobStateCalculator) State() blobState {
        var state blobState
        state.empty = c.empty
        state.diffID = c.digester.Digest()
        return state
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "context"
        "fmt"
        "net/url"
        "sort"
        "strings"

        "github.com/containerd/containerd/v2/pkg/reference"
)

// RepositoryScope builds the registry auth scope for the repository named by
// refspec, for example "repository:foo/bar:pull" for "host/foo/bar:baz".
// With push set the scope grants both actions ("pull,push").
//
// The repository path is obtained by parsing the locator as a URL; a locator
// that does not parse yields an error.
func RepositoryScope(refspec reference.Spec, push bool) (string, error) {
        parsed, err := url.Parse("dummy://" + refspec.Locator)
        if err != nil {
                return "", err
        }

        actions := "pull"
        if push {
                actions = "pull,push"
        }
        repo := strings.TrimPrefix(parsed.Path, "/")
        return "repository:" + repo + ":" + actions, nil
}

// tokenScopesKey is used for the key for context.WithValue().
// value: []string (e.g. {"repository:foo/bar:pull"})
type tokenScopesKey struct{}

// ContextWithRepositoryScope derives a context that carries the repository
// scope of refspec (see RepositoryScope) in addition to any scopes already
// present. When the scope cannot be computed, a nil context and the error
// are returned.
func ContextWithRepositoryScope(ctx context.Context, refspec reference.Spec, push bool) (context.Context, error) {
        scope, err := RepositoryScope(refspec, push)
        if err == nil {
                return WithScope(ctx, scope), nil
        }
        return nil, err
}

// WithScope returns a context derived from ctx whose registry auth scopes
// are the scopes already stored in ctx followed by scope.
//
// The scope list is copied before the new entry is added, so contexts that
// are derived from the same parent never share a backing array and cannot
// overwrite each other's scopes. The parent context is left unchanged.
func WithScope(ctx context.Context, scope string) context.Context {
        // A missing value yields a nil slice, which simply copies as empty.
        existing, _ := ctx.Value(tokenScopesKey{}).([]string)

        scopes := make([]string, 0, len(existing)+1)
        scopes = append(scopes, existing...)
        scopes = append(scopes, scope)
        return context.WithValue(ctx, tokenScopesKey{}, scopes)
}

// ContextWithAppendPullRepositoryScope adds the pull scope for repo
// ("repository:<repo>:pull") to the scopes stored in ctx under
// tokenScopesKey{} and returns the derived context.
func ContextWithAppendPullRepositoryScope(ctx context.Context, repo string) context.Context {
        pullScope := fmt.Sprintf("repository:%s:pull", repo)
        return WithScope(ctx, pullScope)
}

// GetTokenScopes merges the scopes stored in ctx under tokenScopesKey{} with
// the common scopes and returns them sorted with exact duplicates removed.
// The result is a new slice; neither the context value nor common is
// modified. With no scopes at all an empty, non-nil slice is returned.
func GetTokenScopes(ctx context.Context, common []string) []string {
        merged := []string{}
        if fromCtx := ctx.Value(tokenScopesKey{}); fromCtx != nil {
                merged = append(merged, fromCtx.([]string)...)
        }
        merged = append(merged, common...)

        if len(merged) == 0 {
                return merged
        }
        sort.Strings(merged)

        // Compact the sorted slice in place, keeping the first of each run.
        // Note: this comparison is unaware of the scope grammar (https://docs.docker.com/registry/spec/auth/scope/)
        // So, "repository:foo/bar:pull,push" != "repository:foo/bar:push,pull", although semantically they are equal.
        unique := merged[:1]
        for _, scope := range merged[1:] {
                if scope != unique[len(unique)-1] {
                        unique = append(unique, scope)
                }
        }
        return unique
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package docker

import (
        "fmt"
        "sync"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/errdefs"
        "github.com/moby/locker"
)

// Status of a content operation. It extends the content store's status with
// registry specific fields.
type Status struct {
        content.Status

        // Committed presumably reports whether the operation has been
        // committed - NOTE(review): confirm against the code that sets it.
        Committed bool

        // ErrClosed contains error encountered on close.
        ErrClosed error

        // UploadUUID is used by the Docker registry to reference blob uploads
        UploadUUID string

        // PushStatus contains status related to push.
        PushStatus
}

// PushStatus holds the push specific part of a Status.
type PushStatus struct {
        // MountedFrom is the source content was cross-repo mounted from (empty if no cross-repo mount was performed).
        MountedFrom string

        // Exists indicates whether content already exists in the repository and wasn't uploaded.
        Exists bool
}

// StatusTracker to track status of operations. Statuses are stored and
// looked up by a string key (the in-memory implementation calls it ref).
type StatusTracker interface {
        // GetStatus returns the status stored for the given key.
        GetStatus(string) (Status, error)
        // SetStatus stores the status for the given key.
        SetStatus(string, Status)
}

// StatusTrackLocker to track status of operations with lock. In addition to
// the StatusTracker methods it exposes a lock per string key.
type StatusTrackLocker interface {
        StatusTracker
        // Lock acquires the lock for the given key.
        Lock(string)
        // Unlock releases the lock for the given key.
        Unlock(string)
}

// memoryStatusTracker is the in-memory StatusTrackLocker returned by
// NewInMemoryTracker.
type memoryStatusTracker struct {
        // statuses maps a ref to its last stored status; guarded by m.
        statuses map[string]Status
        m        sync.Mutex
        // locker provides the per-ref locks behind Lock and Unlock.
        locker *locker.Locker
}

// NewInMemoryTracker creates a StatusTrackLocker that keeps every status in
// process memory. It starts with no statuses and a fresh set of per-ref
// locks.
func NewInMemoryTracker() StatusTrackLocker {
        tracker := new(memoryStatusTracker)
        tracker.statuses = make(map[string]Status)
        tracker.locker = locker.New()
        return tracker
}

// GetStatus returns the status last stored for ref. When nothing has been
// stored for ref it returns a zero Status and an error wrapping
// errdefs.ErrNotFound.
func (t *memoryStatusTracker) GetStatus(ref string) (Status, error) {
        t.m.Lock()
        defer t.m.Unlock()

        if stored, found := t.statuses[ref]; found {
                return stored, nil
        }
        return Status{}, fmt.Errorf("status for ref %v: %w", ref, errdefs.ErrNotFound)
}

// SetStatus stores status as the current status of ref, replacing any
// previous value. The map is updated while holding the tracker's mutex.
func (t *memoryStatusTracker) SetStatus(ref string, status Status) {
        t.m.Lock()
        defer t.m.Unlock()

        t.statuses[ref] = status
}

// Lock acquires the named lock for ref by delegating to the tracker's
// locker.
func (t *memoryStatusTracker) Lock(ref string) {
        t.locker.Lock(ref)
}

// Unlock releases the named lock for ref by delegating to the tracker's
// locker.
func (t *memoryStatusTracker) Unlock(ref string) {
        t.locker.Unlock(ref)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package errors

import (
        "fmt"
        "io"
        "net/http"
)

// Compile-time check that ErrUnexpectedStatus implements error.
var _ error = ErrUnexpectedStatus{}

// ErrUnexpectedStatus is returned if a registry API request returned with unexpected HTTP status
type ErrUnexpectedStatus struct {
        // Status and StatusCode are copied from the HTTP response.
        Status     string
        StatusCode int
        // Body holds the beginning of the response body (see
        // NewUnexpectedStatusErr for the limit).
        Body []byte
        // RequestURL and RequestMethod describe the request that produced
        // the response.
        RequestURL, RequestMethod string
}

// Error implements the error interface. The message names the request
// method, the request URL and the response status line; the body is not
// included.
func (e ErrUnexpectedStatus) Error() string {
        // All operands are strings, so Sprint joins them without extra spaces.
        return fmt.Sprint("unexpected status from ", e.RequestMethod, " request to ", e.RequestURL, ": ", e.Status)
}

// NewUnexpectedStatusErr creates an ErrUnexpectedStatus from HTTP response.
//
// The status line and code are copied from resp. Up to 64000 bytes of the
// response body are captured on a best-effort basis. The request method and
// URL are filled in only when resp carries the originating request, so a
// response without a Request (for example one built by hand) does not cause
// a nil pointer dereference.
func NewUnexpectedStatusErr(resp *http.Response) error {
        var b []byte
        if resp.Body != nil {
                // Best effort: a read failure still leaves whatever was read
                // so far, and the status itself is the important information.
                b, _ = io.ReadAll(io.LimitReader(resp.Body, 64000)) // 64KB
        }
        err := ErrUnexpectedStatus{
                Body:       b,
                Status:     resp.Status,
                StatusCode: resp.StatusCode,
        }
        if req := resp.Request; req != nil {
                err.RequestMethod = req.Method
                if req.URL != nil {
                        err.RequestURL = req.URL.String()
                }
        }
        return err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package remotes

import (
        "bytes"
        "context"
        "errors"
        "fmt"
        "io"
        "strings"
        "sync"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "golang.org/x/sync/semaphore"
)

// refKeyPrefix is the context key under which the media type to key prefix
// mapping (map[string]string) is stored.
type refKeyPrefix struct{}

// WithMediaTypeKeyPrefix adds a custom key prefix for a media type which is used when storing
// data in the content store from the FetchHandler.
//
// Used in `MakeRefKey` to determine what the key prefix should be.
//
// The returned context carries a copy of the prefixes already present in ctx
// plus the new entry. The mapping stored in ctx itself is never modified, so
// the parent context and contexts derived from it elsewhere keep their own
// prefixes.
func WithMediaTypeKeyPrefix(ctx context.Context, mediaType, prefix string) context.Context {
        var inherited map[string]string
        if v := ctx.Value(refKeyPrefix{}); v != nil {
                inherited = v.(map[string]string)
        }

        // Copy instead of mutating the shared map held by the parent context.
        values := make(map[string]string, len(inherited)+1)
        for mt, p := range inherited {
                values[mt] = p
        }

        values[mediaType] = prefix
        return context.WithValue(ctx, refKeyPrefix{}, values)
}

// MakeRefKey builds the reference string used to track ongoing work for
// desc.
//
// The base key is the digest, or "<name>@<digest>" when the descriptor has a
// ref name annotation. A prefix registered for the media type through
// WithMediaTypeKeyPrefix takes precedence; otherwise the prefix is chosen
// from the kind of media type (manifest, index, layer, config). Unrecognised
// media types are logged and get the "unknown" prefix.
func MakeRefKey(ctx context.Context, desc ocispec.Descriptor) string {
        dgst := desc.Digest.String()
        key := dgst
        // Indexing a nil annotation map is safe and simply reports "absent".
        if name, ok := desc.Annotations[ocispec.AnnotationRefName]; ok {
                key = fmt.Sprintf("%s@%s", name, dgst)
        }

        if v := ctx.Value(refKeyPrefix{}); v != nil {
                if custom := v.(map[string]string)[desc.MediaType]; custom != "" {
                        return custom + "-" + key
                }
        }

        mediaType := desc.MediaType
        if images.IsManifestType(mediaType) {
                return "manifest-" + key
        }
        if images.IsIndexType(mediaType) {
                return "index-" + key
        }
        if images.IsLayerType(mediaType) {
                return "layer-" + key
        }
        if images.IsKnownConfig(mediaType) {
                return "config-" + key
        }

        log.G(ctx).Warnf("reference for unknown type: %s", desc.MediaType)
        return "unknown-" + key
}

// FetchHandler returns a handler that fetches every descriptor it is given
// into the ingester. It never returns child descriptors, so combine it with
// ChildrenHandler for a full recursive fetch.
//
// Docker schema 1 manifests are rejected with an error. Content that already
// exists is treated as success.
func FetchHandler(ingester content.Ingester, fetcher Fetcher) images.HandlerFunc {
        return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                fields := log.Fields{
                        "digest":    desc.Digest,
                        "mediatype": desc.MediaType,
                        "size":      desc.Size,
                }
                ctx = log.WithLogger(ctx, log.G(ctx).WithFields(fields))

                if desc.MediaType == images.MediaTypeDockerSchema1Manifest {
                        return nil, fmt.Errorf("%v not supported", desc.MediaType)
                }

                if err := Fetch(ctx, ingester, fetcher, desc); err != nil && !errdefs.IsAlreadyExists(err) {
                        return nil, err
                }
                return nil, nil
        }
}

// Fetch fetches the given digest into the provided ingester
//
// A writer is opened under the descriptor's ref key first; any error from
// opening it (including "already exists") is returned as is. Descriptors that
// report a size of zero are rejected. If the writer already holds desc.Size
// bytes it is committed right away. If desc.Data holds the complete content
// it is written directly; otherwise the content is read from the fetcher.
func Fetch(ctx context.Context, ingester content.Ingester, fetcher Fetcher, desc ocispec.Descriptor) error {
        log.G(ctx).Debug("fetch")

        cw, err := content.OpenWriter(ctx, ingester, content.WithRef(MakeRefKey(ctx, desc)), content.WithDescriptor(desc))
        if err != nil {
                return err
        }
        defer cw.Close()

        // The writer status tells how much has been written so far (Offset).
        ws, err := cw.Status()
        if err != nil {
                return err
        }

        if desc.Size == 0 {
                // most likely a poorly configured registry/web front end which responded with no
                // Content-Length header; unable (not to mention useless) to commit a 0-length entry
                // into the content store. Error out here otherwise the error sent back is confusing
                return fmt.Errorf("unable to fetch descriptor (%s) which reports content size of zero: %w", desc.Digest, errdefs.ErrInvalidArgument)
        }
        if ws.Offset == desc.Size {
                // If writer is already complete, commit and return
                err := cw.Commit(ctx, desc.Size, desc.Digest)
                if err != nil && !errdefs.IsAlreadyExists(err) {
                        return fmt.Errorf("failed commit on ref %q: %w", ws.Ref, err)
                }
                // An "already exists" error is passed through unwrapped.
                return err
        }

        // The descriptor carries the full content inline: no fetch is needed.
        if desc.Size == int64(len(desc.Data)) {
                return content.Copy(ctx, cw, bytes.NewReader(desc.Data), desc.Size, desc.Digest)
        }

        rc, err := fetcher.Fetch(ctx, desc)
        if err != nil {
                return err
        }
        defer rc.Close()

        return content.Copy(ctx, cw, rc, desc.Size, desc.Digest)
}

// PushHandler returns a handler that pushes each descriptor it is given,
// reading the content from provider and writing it through pusher. The
// handler never returns child descriptors.
func PushHandler(pusher Pusher, provider content.Provider) images.HandlerFunc {
        return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                // Attach the descriptor details to the logger used during the push.
                logger := log.G(ctx).WithFields(log.Fields{
                        "digest":    desc.Digest,
                        "mediatype": desc.MediaType,
                        "size":      desc.Size,
                })
                return nil, push(log.WithLogger(ctx, logger), provider, pusher, desc)
        }
}

// push copies the content described by desc from provider to pusher.
//
// When the pusher also implements content.Ingester, the writer is opened
// through content.OpenWriter under the descriptor's ref key; otherwise
// pusher.Push supplies it. An "already exists" error while opening the
// writer means there is nothing to push and is reported as success.
func push(ctx context.Context, provider content.Provider, pusher Pusher, desc ocispec.Descriptor) error {
        log.G(ctx).Debug("push")

        var (
                writer content.Writer
                err    error
        )
        if ingester, isIngester := pusher.(content.Ingester); isIngester {
                writer, err = content.OpenWriter(ctx, ingester, content.WithRef(MakeRefKey(ctx, desc)), content.WithDescriptor(desc))
        } else {
                writer, err = pusher.Push(ctx, desc)
        }
        switch {
        case err == nil:
                // writer is ready
        case errdefs.IsAlreadyExists(err):
                return nil
        default:
                return err
        }
        defer writer.Close()

        source, err := provider.ReaderAt(ctx, desc)
        if err != nil {
                return err
        }
        defer source.Close()

        section := io.NewSectionReader(source, 0, desc.Size)
        return content.Copy(ctx, writer, section, desc.Size, desc.Digest)
}

// PushContent pushes the content specified by desc, reading it from store and
// writing it through pusher.
//
// The work is dispatched in three passes. The first pass walks desc with a
// handler chain that records every manifest and index descriptor it meets and
// returns images.ErrStopHandler for them. The second pass dispatches the
// recorded manifests to the push handler, and the third pass dispatches the
// recorded indexes to it in reverse order of discovery.
//
// If wrapper is non-nil it is applied to the handler chain used by the first
// pass.
//
// If the passed in content.Provider is also a content.InfoProvider (such as
// content.Manager) then this will also annotate the distribution sources using
// labels prefixed with "containerd.io/distribution.source".
func PushContent(ctx context.Context, pusher Pusher, desc ocispec.Descriptor, store content.Provider, limiter *semaphore.Weighted, platform platforms.MatchComparer, wrapper func(h images.Handler) images.Handler) error {
        var (
                mu         sync.Mutex // guards manifests and indexStack
                manifests  = []ocispec.Descriptor{}
                indexStack = []ocispec.Descriptor{}
        )

        // filterHandler sets manifests and indexes aside for the later passes.
        filterHandler := images.HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                switch {
                case images.IsManifestType(desc.MediaType):
                        mu.Lock()
                        manifests = append(manifests, desc)
                        mu.Unlock()
                        return nil, images.ErrStopHandler
                case images.IsIndexType(desc.MediaType):
                        mu.Lock()
                        indexStack = append(indexStack, desc)
                        mu.Unlock()
                        return nil, images.ErrStopHandler
                }
                return nil, nil
        })

        pushHandler := PushHandler(pusher, store)

        // The first handler of the chain resolves children, filtered by
        // platform; it additionally annotates distribution sources when the
        // store can provide content info.
        childrenHandler := images.FilterPlatforms(images.ChildrenHandler(store), platform)
        if infoProvider, ok := store.(content.InfoProvider); ok {
                childrenHandler = annotateDistributionSourceHandler(childrenHandler, infoProvider)
        }

        var handler images.Handler = images.Handlers(childrenHandler, filterHandler, pushHandler)
        if wrapper != nil {
                handler = wrapper(handler)
        }

        if err := images.Dispatch(ctx, handler, limiter, desc); err != nil {
                return err
        }

        if err := images.Dispatch(ctx, pushHandler, limiter, manifests...); err != nil {
                return err
        }

        // Iterate in reverse order as seen, parent always uploaded after child
        for i := len(indexStack) - 1; i >= 0; i-- {
                err := images.Dispatch(ctx, pushHandler, limiter, indexStack[i])
                if err == nil {
                        continue
                }
                // TODO(estesp): until we have a more complete method for index push, we need to report
                // missing dependencies in an index/manifest list by sensing the "400 Bad Request"
                // as a marker for this problem
                if cause := errors.Unwrap(err); cause != nil && strings.Contains(cause.Error(), "400 Bad Request") {
                        return fmt.Errorf("manifest list/index references to blobs and/or manifests are missing in your target registry: %w", err)
                }
                return err
        }

        return nil
}

// SkipNonDistributableBlobs wraps f so that blobs with a "non-distributable"
// media type are skipped. An example of this kind of content would be a Windows
// base layer, which is not supposed to be redistributed.
//
// A non-distributable descriptor is not passed to f at all; the wrapper
// returns images.ErrSkipDesc for it. For every other descriptor the children
// returned by f are filtered so that non-distributable children are dropped.
//
// This is based on the media type of the content:
//   - application/vnd.oci.image.layer.nondistributable
//   - application/vnd.docker.image.rootfs.foreign
func SkipNonDistributableBlobs(f images.HandlerFunc) images.HandlerFunc {
        return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                if images.IsNonDistributable(desc.MediaType) {
                        log.G(ctx).WithField("digest", desc.Digest).WithField("mediatype", desc.MediaType).Debug("Skipping non-distributable blob")
                        return nil, images.ErrSkipDesc
                }

                children, err := f(ctx, desc)
                switch {
                case err != nil:
                        return nil, err
                case len(children) == 0:
                        return nil, nil
                }

                kept := make([]ocispec.Descriptor, 0, len(children))
                for _, child := range children {
                        if images.IsNonDistributable(child.MediaType) {
                                log.G(ctx).WithField("digest", child.Digest).WithField("mediatype", child.MediaType).Debug("Skipping non-distributable blob")
                                continue
                        }
                        kept = append(kept, child)
                }
                return kept, nil
        }
}

// FilterManifestByPlatformHandler wraps f so that a manifest whose platform
// does not match m only yields its config children; all other children of
// such a manifest are dropped. Descriptors without platform information, a
// nil matcher, non-manifest descriptors and matching manifests pass the
// children returned by f through unchanged.
func FilterManifestByPlatformHandler(f images.HandlerFunc, m platforms.Matcher) images.HandlerFunc {
        return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                children, err := f(ctx, desc)
                if err != nil {
                        return nil, err
                }

                // Nothing to filter unless this is a manifest carrying platform
                // information that the matcher rejects.
                if desc.Platform == nil || m == nil || !images.IsManifestType(desc.MediaType) || m.Match(*desc.Platform) {
                        return children, nil
                }

                var configs []ocispec.Descriptor
                for i := range children {
                        if images.IsConfigType(children[i].MediaType) {
                                configs = append(configs, children[i])
                        }
                }
                return configs, nil
        }
}

// annotateDistributionSourceHandler wraps f so that the children of a manifest
// or index descriptor get distribution source labels copied into their
// annotations. Labels are taken, in order of precedence, from the child's own
// content info, from the parent descriptor's annotations and from the parent's
// content info. Content info that is not found in provider is ignored; any
// other lookup error is returned.
func annotateDistributionSourceHandler(f images.HandlerFunc, provider content.InfoProvider) images.HandlerFunc {
        return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                children, err := f(ctx, desc)
                if err != nil {
                        return nil, err
                }

                // Distribution source is only used for config or blob but may be inherited from
                // a manifest or manifest list
                if !(images.IsManifestType(desc.MediaType) || images.IsIndexType(desc.MediaType)) {
                        return children, nil
                }

                var parentLabels map[string]string
                parentInfo, err := provider.Info(ctx, desc.Digest)
                switch {
                case err == nil:
                        parentLabels = parentInfo.Labels
                case !errdefs.IsNotFound(err):
                        return nil, err
                }

                for i := range children {
                        child := &children[i]

                        info, err := provider.Info(ctx, child.Digest)
                        if err != nil && !errdefs.IsNotFound(err) {
                                return nil, err
                        }
                        // On a not-found error info is the zero value, so its
                        // labels are empty.
                        copyDistributionSourceLabels(info.Labels, child)

                        // Annotate with parent labels for cross repo mount or fetch.
                        // Parent sources may apply to all children since most registries
                        // enforce that children exist before the manifests.
                        copyDistributionSourceLabels(desc.Annotations, child)
                        copyDistributionSourceLabels(parentLabels, child)
                }
                return children, nil
        }
}

// copyDistributionSourceLabels copies every entry of from whose key starts
// with labels.LabelDistributionSource followed by "." into to.Annotations.
// Keys already present in to.Annotations are left untouched, and the
// annotation map is allocated only when there is something to copy.
func copyDistributionSourceLabels(from map[string]string, to *ocispec.Descriptor) {
        prefix := labels.LabelDistributionSource + "."
        for key, value := range from {
                if !strings.HasPrefix(key, prefix) {
                        continue
                }

                // Only propagate the label if the target doesn't already have it.
                // Looking a key up in a nil map is safe and reports it as absent.
                if _, exists := to.Annotations[key]; exists {
                        continue
                }
                if to.Annotations == nil {
                        to.Annotations = make(map[string]string)
                }
                to.Annotations[key] = value
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package remotes

import (
        "context"
        "io"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// Resolver provides remotes based on a locator. It resolves a reference to a
// name and descriptor, and hands out Fetcher and Pusher instances for a
// reference.
type Resolver interface {
        // Resolve attempts to resolve the reference into a name and descriptor.
        //
        // The argument `ref` should be a scheme-less URI representing the remote.
        // Structurally, it has a host and path. The "host" can be used to directly
        // reference a specific host or be matched against a specific handler.
        //
        // The returned name should be used to identify the referenced entity.
        // Depending on the remote namespace, this may be immutable or mutable.
        // While the name may differ from ref, it should itself be a valid ref.
        //
        // If the resolution fails, an error will be returned.
        Resolve(ctx context.Context, ref string) (name string, desc ocispec.Descriptor, err error)

        // Fetcher returns a new fetcher for the provided reference.
        // All content fetched from the returned fetcher will be
        // from the namespace referred to by ref.
        Fetcher(ctx context.Context, ref string) (Fetcher, error)

        // Pusher returns a new pusher for the provided reference.
        // The returned Pusher should satisfy content.Ingester, and concurrent
        // attempts to push the same blob using the Ingester API should result
        // in ErrUnavailable.
        Pusher(ctx context.Context, ref string) (Pusher, error)
}

// Fetcher fetches content.
// A fetcher implementation may implement the FetcherByDigest interface too.
type Fetcher interface {
        // Fetch the resource identified by the descriptor. The content is
        // returned as an io.ReadCloser.
        Fetch(ctx context.Context, desc ocispec.Descriptor) (io.ReadCloser, error)
}

// FetcherByDigest fetches content by the digest.
type FetcherByDigest interface {
        // FetchByDigest fetches the resource identified by the digest and
        // returns its content together with a descriptor for it.
        //
        // FetcherByDigest usually returns an incomplete descriptor.
        // Typically, the media type is always set to "application/octet-stream",
        // and the annotations are unset.
        //
        // The fetch can be customized through opts, for example WithMediaType.
        FetchByDigest(ctx context.Context, dgst digest.Digest, opts ...FetchByDigestOpts) (io.ReadCloser, ocispec.Descriptor, error)
}

// Pusher pushes content.
type Pusher interface {
        // Push returns a content writer for the given resource identified
        // by the descriptor.
        Push(ctx context.Context, d ocispec.Descriptor) (content.Writer, error)
}

// FetcherFunc allows package users to implement a Fetcher with just a
// function.
type FetcherFunc func(ctx context.Context, desc ocispec.Descriptor) (io.ReadCloser, error)

// Fetch implements Fetcher by calling fn with the given context and
// descriptor and returning its results unchanged.
func (fn FetcherFunc) Fetch(ctx context.Context, desc ocispec.Descriptor) (io.ReadCloser, error) {
        return fn(ctx, desc)
}

// PusherFunc allows package users to implement a Pusher with just a
// function.
type PusherFunc func(ctx context.Context, desc ocispec.Descriptor) (content.Writer, error)

// Push implements Pusher by calling fn with the given context and descriptor
// and returning its results unchanged.
func (fn PusherFunc) Push(ctx context.Context, desc ocispec.Descriptor) (content.Writer, error) {
        return fn(ctx, desc)
}

// FetchByDigestConfig provides configuration for fetching content by digest.
// It is populated by FetchByDigestOpts functions.
type FetchByDigestConfig struct {
        // Mediatype specifies the media type header to append to the fetch request.
        Mediatype string
}

// FetchByDigestOpts allows callers to set options for a fetch-by-digest
// request. Each option receives the configuration to modify and may return an
// error.
type FetchByDigestOpts func(context.Context, *FetchByDigestConfig) error

// WithMediaType returns an option that stores mediatype in the Mediatype field
// of the fetch configuration, to be used as the media type header of the fetch
// request. The returned option never fails.
func WithMediaType(mediatype string) FetchByDigestOpts {
        return func(ctx context.Context, cfg *FetchByDigestConfig) error {
                cfg.Mediatype = mediatype
                return nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package runtime

import (
        "github.com/containerd/containerd/api/events"
        "github.com/containerd/log"
)

// Topic strings for task events. GetTopic maps each task event type to one
// of these values.
const (
        // TaskCreateEventTopic for task create
        TaskCreateEventTopic = "/tasks/create"
        // TaskStartEventTopic for task start
        TaskStartEventTopic = "/tasks/start"
        // TaskOOMEventTopic for task oom
        TaskOOMEventTopic = "/tasks/oom"
        // TaskExitEventTopic for task exit
        TaskExitEventTopic = "/tasks/exit"
        // TaskDeleteEventTopic for task delete
        TaskDeleteEventTopic = "/tasks/delete"
        // TaskExecAddedEventTopic for task exec create
        TaskExecAddedEventTopic = "/tasks/exec-added"
        // TaskExecStartedEventTopic for task exec start
        TaskExecStartedEventTopic = "/tasks/exec-started"
        // TaskPausedEventTopic for task pause
        TaskPausedEventTopic = "/tasks/paused"
        // TaskResumedEventTopic for task resume
        TaskResumedEventTopic = "/tasks/resumed"
        // TaskCheckpointedEventTopic for task checkpoint
        TaskCheckpointedEventTopic = "/tasks/checkpointed"
        // TaskUnknownTopic for unknown task events; GetTopic returns it for any
        // type it does not recognize
        TaskUnknownTopic = "/tasks/?"
)

// GetTopic maps a task event, identified by its concrete pointer type, to the
// matching event topic id. For any other type a warning is logged and
// TaskUnknownTopic is returned.
func GetTopic(e interface{}) string {
        switch e.(type) {
        case *events.TaskCreate:
                return TaskCreateEventTopic
        case *events.TaskStart:
                return TaskStartEventTopic
        case *events.TaskOOM:
                return TaskOOMEventTopic
        case *events.TaskExit:
                return TaskExitEventTopic
        case *events.TaskDelete:
                return TaskDeleteEventTopic
        case *events.TaskExecAdded:
                return TaskExecAddedEventTopic
        case *events.TaskExecStarted:
                return TaskExecStartedEventTopic
        case *events.TaskPaused:
                return TaskPausedEventTopic
        case *events.TaskResumed:
                return TaskResumedEventTopic
        case *events.TaskCheckpointed:
                return TaskCheckpointedEventTopic
        }

        // Every known type returned above; anything reaching this point has no
        // dedicated topic.
        log.L.Warnf("no topic for type %#v", e)
        return TaskUnknownTopic
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package runtime

// TaskMonitor provides an interface for monitoring of containers within containerd.
type TaskMonitor interface {
        // Monitor adds the provided container to the monitor.
        // Labels are optional (can be nil) key value pairs to be added to the metrics namespace.
        Monitor(task Task, labels map[string]string) error
        // Stop stops and removes the provided container from the monitor.
        Stop(task Task) error
}

// NewMultiTaskMonitor returns a new TaskMonitor broadcasting to the provided
// monitors. Calls are forwarded to the monitors in the order given here.
func NewMultiTaskMonitor(monitors ...TaskMonitor) TaskMonitor {
        return &multiTaskMonitor{
                // The variadic slice is stored as-is, not copied.
                monitors: monitors,
        }
}

// NewNoopMonitor returns a TaskMonitor that does nothing: its Monitor and Stop
// methods always return nil.
func NewNoopMonitor() TaskMonitor {
        return &noopTaskMonitor{}
}

// noopTaskMonitor is a TaskMonitor whose methods do nothing.
type noopTaskMonitor struct {
}

// Monitor ignores its arguments and always returns nil.
func (mm *noopTaskMonitor) Monitor(c Task, labels map[string]string) error {
        return nil
}

// Stop ignores its argument and always returns nil.
func (mm *noopTaskMonitor) Stop(c Task) error {
        return nil
}

// multiTaskMonitor is a TaskMonitor that forwards every call to a list of
// underlying monitors.
type multiTaskMonitor struct {
        monitors []TaskMonitor
}

// Monitor calls Monitor on each underlying monitor in order and returns the
// first error encountered; later monitors are not called after a failure.
func (mm *multiTaskMonitor) Monitor(task Task, labels map[string]string) error {
        targets := mm.monitors
        for i := 0; i < len(targets); i++ {
                if err := targets[i].Monitor(task, labels); err != nil {
                        return err
                }
        }
        return nil
}

// Stop calls Stop on each underlying monitor in order and returns the first
// error encountered; later monitors are not called after a failure.
func (mm *multiTaskMonitor) Stop(c Task) error {
        targets := mm.monitors
        for i := 0; i < len(targets); i++ {
                if err := targets[i].Stop(c); err != nil {
                        return err
                }
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package runtime

import (
        "context"
        "fmt"
        "sync"

        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/errdefs"
)

// object is the constraint for values stored in an NSMap: each value must be
// able to report the ID it is stored under.
type object interface {
        ID() string
}

// NSMap extends Map type with a notion of namespaces passed via Context.
// Use NewNSMap to create one; the zero value has a nil objects map.
type NSMap[T object] struct {
        // mu guards objects.
        mu sync.RWMutex
        // objects is keyed by namespace first and by object ID second.
        objects map[string]map[string]T
}

// NewNSMap returns a new, empty NSMap with its namespace map allocated.
func NewNSMap[T object]() *NSMap[T] {
        return &NSMap[T]{
                objects: make(map[string]map[string]T),
        }
}

// Get returns the object stored under id in the namespace carried by ctx.
// It returns the error from namespaces.NamespaceRequired when ctx has no
// usable namespace, and errdefs.ErrNotFound when either the namespace or the
// id is unknown. In every error case the zero value of T is returned.
func (m *NSMap[T]) Get(ctx context.Context, id string) (T, error) {
        var zero T

        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return zero, err
        }

        m.mu.RLock()
        defer m.mu.RUnlock()

        // Indexing the missing (nil) inner map of an unknown namespace is safe
        // and simply reports the id as absent.
        if obj, found := m.objects[ns][id]; found {
                return obj, nil
        }
        return zero, errdefs.ErrNotFound
}

// GetAll returns the stored objects. With noNS set, the objects of every
// namespace are returned and ctx is not consulted. Otherwise only the objects
// of the namespace carried by ctx are returned, and a missing namespace in ctx
// yields the error from namespaces.NamespaceRequired. The result is nil when
// there is nothing to return.
func (m *NSMap[T]) GetAll(ctx context.Context, noNS bool) ([]T, error) {
        m.mu.RLock()
        defer m.mu.RUnlock()

        var all []T
        if noNS {
                for _, perNS := range m.objects {
                        for _, obj := range perNS {
                                all = append(all, obj)
                        }
                }
                return all, nil
        }

        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }
        // Ranging over the nil map of an unknown namespace is a no-op.
        for _, obj := range m.objects[ns] {
                all = append(all, obj)
        }
        return all, nil
}

// Add stores t under the namespace carried by ctx. It returns the error from
// namespaces.NamespaceRequired when ctx has no usable namespace, and otherwise
// whatever AddWithNamespace returns.
func (m *NSMap[T]) Add(ctx context.Context, t T) error {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err == nil {
                err = m.AddWithNamespace(ns, t)
        }
        return err
}

// AddWithNamespace stores t under namespace, keyed by t.ID(). The per-namespace
// map is created on first use. If an object with the same ID already exists in
// that namespace nothing is stored and an error wrapping
// errdefs.ErrAlreadyExists is returned.
func (m *NSMap[T]) AddWithNamespace(namespace string, t T) error {
        key := t.ID()

        m.mu.Lock()
        defer m.mu.Unlock()

        bucket, known := m.objects[namespace]
        if !known {
                bucket = make(map[string]T)
                m.objects[namespace] = bucket
        }
        if _, taken := bucket[key]; taken {
                return fmt.Errorf("%s: %w", key, errdefs.ErrAlreadyExists)
        }
        bucket[key] = t
        return nil
}

// Delete removes the object stored under id from the namespace carried by ctx.
// It silently does nothing when ctx has no usable namespace or when the
// namespace or id is unknown. The per-namespace map itself is kept.
func (m *NSMap[T]) Delete(ctx context.Context, id string) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return
        }

        m.mu.Lock()
        defer m.mu.Unlock()

        // delete is a no-op on the nil map returned for an unknown namespace.
        delete(m.objects[ns], id)
}

// IsEmpty reports whether no namespace holds any object. Namespaces whose
// per-namespace map is empty do not count.
func (m *NSMap[T]) IsEmpty() bool {
        m.mu.RLock()
        defer m.mu.RUnlock()

        for _, perNS := range m.objects {
                if len(perNS) != 0 {
                        return false
                }
        }
        return true
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package restart enables containers to have labels added and monitored to
// keep the container's task running if it is killed.
//
// Setting the StatusLabel on a container instructs the restart monitor to keep
// that container's task in a specific status.
// Setting the LogURILabel on a container will set up the task's IO to be redirected
// to the given log URI when running a task within the restart manager.
//
// The restart labels can be cleared off of a container using the WithNoRestarts Opt.
//
// The restart monitor has one option in the containerd config under the [plugins.restart]
// section. `interval = "10s"` sets the reconcile interval at which the restart monitor
// checks task state and reconciles the desired status for each task.
package restart

import (
        "context"
        "fmt"
        "net/url"
        "strconv"
        "strings"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/log"
)

// Container label keys used by the restart package.
const (
        // StatusLabel sets the restart status label for a container
        StatusLabel = "containerd.io/restart.status"
        // LogURILabel sets the restart log uri label for a container
        LogURILabel = "containerd.io/restart.loguri"

        // PolicyLabel sets the restart policy label for a container;
        // Reconcile parses its value with NewPolicy
        PolicyLabel = "containerd.io/restart.policy"
        // CountLabel sets the restart count label for a container;
        // Reconcile reads it as an integer for the "on-failure" policy
        CountLabel = "containerd.io/restart.count"
        // ExplicitlyStoppedLabel sets the restart explicitly stopped label for a container;
        // Reconcile reads it as a boolean for the "unless-stopped" policy
        ExplicitlyStoppedLabel = "containerd.io/restart.explicitly-stopped"
)

// Policy represents the restart policy of a container: a policy name plus,
// for the "on-failure" policy, an optional maximum retry count.
type Policy struct {
        name              string
        maximumRetryCount int
}

// NewPolicy parses a restart policy string of the form name[:max-retries].
// It supports the following restart policies:
// - no, Do not restart the container.
// - always, Always restart the container regardless of the exit status.
// - on-failure[:max-retries], Restart only if the container exits with a non-zero exit status.
// - unless-stopped, Always restart the container unless it is stopped.
//
// An empty policy string is treated as "always". Only "on-failure" accepts a
// max-retries suffix; it must be a non-negative decimal integer, and a count
// of 0 (or no suffix at all) means the number of retries is unlimited.
//
// An error is returned for an unknown policy name, for a max-retries suffix on
// a policy that does not support one, and for a suffix that is not a
// non-negative integer (including input with more than one ":").
func NewPolicy(policy string) (*Policy, error) {
        // Split once: everything after the first ":" is the retry count, so
        // trailing garbage such as "on-failure:3:x" is rejected by the integer
        // parse below instead of being silently dropped.
        name, count, hasCount := strings.Cut(policy, ":")

        var retryCount int
        switch name {
        case "", "no", "always", "unless-stopped":
                if hasCount {
                        return nil, fmt.Errorf("restart policy %q not support max retry count", name)
                }
                if name == "" {
                        name = "always"
                }
        case "on-failure":
                if hasCount {
                        n, err := strconv.Atoi(count)
                        // A negative count cannot be represented by String and
                        // would make Reconcile refuse every restart, so reject it.
                        if err != nil || n < 0 {
                                return nil, fmt.Errorf("invalid max retry count: %s", count)
                        }
                        retryCount = n
                }
        default:
                return nil, fmt.Errorf("restart policy %q not supported", policy)
        }
        return &Policy{
                name:              name,
                maximumRetryCount: retryCount,
        }, nil
}

// String returns the policy in the form accepted by NewPolicy: the name,
// followed by ":<max-retries>" when the maximum retry count is positive.
func (rp *Policy) String() string {
        if rp.maximumRetryCount > 0 {
                return fmt.Sprintf("%s:%d", rp.name, rp.maximumRetryCount)
        }
        return rp.name
}

// Name returns the policy name without any retry count.
func (rp *Policy) Name() string {
        return rp.name
}

// MaximumRetryCount returns the maximum retry count; 0 means no limit was set.
func (rp *Policy) MaximumRetryCount() int {
        return rp.maximumRetryCount
}

// Reconcile reports whether a container with the given status and labels
// should be restarted according to the restart policy stored in PolicyLabel.
//
//   - "always" (also used when the label is empty): always restart.
//   - "on-failure": restart only on a non-zero exit status, and only while the
//     count in CountLabel is below the policy's maximum retry count (a maximum
//     of 0 means no limit).
//   - "unless-stopped": restart unless ExplicitlyStoppedLabel parses as true.
//   - "no": never restart.
//
// A policy or count label that cannot be parsed is logged and reported as
// "do not restart".
func Reconcile(status containerd.Status, labels map[string]string) bool {
        rp, err := NewPolicy(labels[PolicyLabel])
        if err != nil {
                log.L.WithError(err).Error("policy reconcile")
                return false
        }

        switch rp.Name() {
        case "", "always":
                return true
        case "on-failure":
                rawCount := labels[CountLabel]
                restartCount, err := strconv.Atoi(rawCount)
                // A missing count label is fine and counts as zero restarts.
                if err != nil && rawCount != "" {
                        log.L.WithError(err).Error("policy reconcile")
                        return false
                }
                if status.ExitStatus == 0 {
                        return false
                }
                return rp.maximumRetryCount == 0 || restartCount < rp.maximumRetryCount
        case "unless-stopped":
                // An absent or unparsable label is treated as "not stopped".
                explicitlyStopped, _ := strconv.ParseBool(labels[ExplicitlyStoppedLabel])
                return !explicitlyStopped
        }
        return false
}

// WithLogURI sets the specified log uri for a container. It is a convenience
// wrapper that passes uri.String() to WithLogURIString.
func WithLogURI(uri *url.URL) func(context.Context, *containerd.Client, *containers.Container) error {
        return WithLogURIString(uri.String())
}

// WithLogURIString sets the specified log uri string for a container by
// storing it under LogURILabel. The container's label map is created if it is
// nil. The returned function never fails.
func WithLogURIString(uriString string) func(context.Context, *containerd.Client, *containers.Container) error {
        return func(_ context.Context, _ *containerd.Client, c *containers.Container) error {
                ensureLabels(c)
                c.Labels[LogURILabel] = uriString
                return nil
        }
}

// WithStatus sets the status for a container by storing it, converted to a
// string, under StatusLabel. The container's label map is created if it is
// nil. The returned function never fails.
func WithStatus(status containerd.ProcessStatus) func(context.Context, *containerd.Client, *containers.Container) error {
        return func(_ context.Context, _ *containerd.Client, c *containers.Container) error {
                ensureLabels(c)
                c.Labels[StatusLabel] = string(status)
                return nil
        }
}

// WithPolicy sets the restart policy for a container by storing
// policy.String() under PolicyLabel. The container's label map is created if
// it is nil. The returned function never fails.
func WithPolicy(policy *Policy) func(context.Context, *containerd.Client, *containers.Container) error {
        return func(_ context.Context, _ *containerd.Client, c *containers.Container) error {
                ensureLabels(c)
                // The policy is rendered when the option is applied, not when
                // WithPolicy is called.
                c.Labels[PolicyLabel] = policy.String()
                return nil
        }
}

// WithNoRestarts clears restart information from the container by removing
// the StatusLabel, PolicyLabel and LogURILabel entries from its labels. It
// never fails and leaves a nil label map untouched.
func WithNoRestarts(_ context.Context, _ *containerd.Client, c *containers.Container) error {
        // delete is a no-op on a nil map, so no nil check is needed.
        for _, key := range [...]string{StatusLabel, PolicyLabel, LogURILabel} {
                delete(c.Labels, key)
        }
        return nil
}

// ensureLabels allocates the container's label map if it is nil, so that
// callers can assign into c.Labels without panicking.
func ensureLabels(c *containers.Container) {
        if c.Labels == nil {
                c.Labels = make(map[string]string)
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package runtime

import (
        "strconv"

        "github.com/containerd/typeurl/v2"
        specs "github.com/opencontainers/runtime-spec/specs-go"
        "github.com/opencontainers/runtime-spec/specs-go/features"
)

// init registers type URLs for commonly marshaled external runtime-spec types.
// Every registration is made from the path segments "types.containerd.io",
// "opencontainers/runtime-spec" and the runtime-spec major version, followed
// by the type-specific segments.
func init() {
        const (
                prefix = "types.containerd.io"
                module = "opencontainers/runtime-spec"
        )
        major := strconv.Itoa(specs.VersionMajor)

        // register prepends the shared segments to the type-specific ones.
        register := func(v interface{}, name ...string) {
                typeurl.Register(v, append([]string{prefix, module, major}, name...)...)
        }

        register(&specs.Spec{}, "Spec")
        register(&specs.Process{}, "Process")
        register(&specs.LinuxResources{}, "LinuxResources")
        register(&specs.WindowsResources{}, "WindowsResources")
        register(&features.Features{}, "features", "Features")
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "bytes"
        "context"
        "fmt"
        "io"
        "os"
        "path/filepath"
        gruntime "runtime"

        "github.com/containerd/containerd/api/runtime/task/v2"
        "github.com/containerd/containerd/v2/core/runtime"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/containerd/v2/pkg/protobuf/proto"
        "github.com/containerd/containerd/v2/pkg/protobuf/types"
        client "github.com/containerd/containerd/v2/pkg/shim"
        "github.com/containerd/log"
)

// shimBinaryConfig carries the settings that shimBinary copies into a binary.
type shimBinaryConfig struct {
        // runtime is passed to the shim command as its Runtime.
        runtime string
        // address is stored as the binary's containerdAddress.
        address string
        // ttrpcAddress is stored as the binary's containerdTTRPCAddress.
        ttrpcAddress string
        // schedCore is passed to the shim command as its SchedCore setting.
        schedCore bool
}

// shimBinary returns a binary bound to bundle whose runtime, containerd
// addresses and sched-core flag are taken from config.
func shimBinary(bundle *Bundle, config shimBinaryConfig) *binary {
        bin := new(binary)
        bin.bundle = bundle
        bin.runtime = config.runtime
        bin.containerdAddress = config.address
        bin.containerdTTRPCAddress = config.ttrpcAddress
        bin.schedCore = config.schedCore
        return bin
}

// binary holds what is needed to invoke a shim executable for one bundle; its
// Start and Delete methods run the executable with the "start" and "delete"
// commands respectively.
type binary struct {
        // runtime is passed as CommandConfig.Runtime and is also written to the
        // bundle's "shim-binary-path" file by Start.
        runtime                string
        // containerdAddress is passed as CommandConfig.Address.
        containerdAddress      string
        // containerdTTRPCAddress is passed as CommandConfig.TTRPCAddress.
        containerdTTRPCAddress string
        // schedCore is passed as CommandConfig.SchedCore by Start only.
        schedCore              bool
        // bundle supplies the ID and Path used on the shim command line.
        bundle                 *Bundle
}

// Start runs the shim binary with the "start" command for this bundle, parses
// the bootstrap parameters printed by the command, connects to the shim and
// returns a shim handle.
//
// opts is forwarded to the shim command as CommandConfig.Opts. onClose is
// invoked from the connection's close callback, before the shim log context is
// cancelled and the log pipe is closed.
//
// err is a named result so that the deferred cleanups below can observe the
// error being returned and release the shim log resources on failure.
func (b *binary) Start(ctx context.Context, opts *types.Any, onClose func()) (_ *shim, err error) {
        // Command line: -id <bundle id> [-debug] start
        args := []string{"-id", b.bundle.ID}
        switch log.GetLevel() {
        case log.DebugLevel, log.TraceLevel:
                args = append(args, "-debug")
        }
        args = append(args, "start")

        cmd, err := client.Command(
                ctx,
                &client.CommandConfig{
                        Runtime:      b.runtime,
                        Address:      b.containerdAddress,
                        TTRPCAddress: b.containerdTTRPCAddress,
                        Path:         b.bundle.Path,
                        Opts:         opts,
                        Args:         args,
                        SchedCore:    b.schedCore,
                })
        if err != nil {
                return nil, err
        }
        // Windows needs a namespace when openShimLog
        // The namespace lookup error is ignored; ns is simply empty when ctx
        // carries no namespace.
        ns, _ := namespaces.Namespace(ctx)
        // The shim log context is derived from context.Background rather than
        // ctx, so cancelling ctx does not cancel it; it is cancelled explicitly
        // via cancelShimLog.
        shimCtx, cancelShimLog := context.WithCancel(namespaces.WithNamespace(context.Background(), ns))
        defer func() {
                // Only cancel on failure; on success cancellation is deferred to
                // onCloseWithShimLog.
                if err != nil {
                        cancelShimLog()
                }
        }()
        f, err := openShimLog(shimCtx, b.bundle, client.AnonDialer)
        if err != nil {
                return nil, fmt.Errorf("open shim log pipe: %w", err)
        }
        defer func() {
                // Close the log pipe if Start fails after it was opened.
                if err != nil {
                        f.Close()
                }
        }()
        // open the log pipe and block until the writer is ready
        // this helps with synchronization of the shim
        // copy the shim's logs to containerd's output
        go func() {
                defer f.Close()
                // This err is local to the goroutine and does not touch the
                // named result of Start.
                _, err := io.Copy(os.Stderr, f)
                // To prevent flood of error messages, the expected error
                // should be reset, like os.ErrClosed or os.ErrNotExist, which
                // depends on platform.
                err = checkCopyShimLogError(ctx, err)
                if err != nil {
                        log.G(ctx).WithError(err).Error("copy shim log")
                }
        }()
        // Run the start command; its combined output is included in the error
        // on failure and is the start response on success.
        out, err := cmd.CombinedOutput()
        if err != nil {
                return nil, fmt.Errorf("%s: %w", out, err)
        }
        response := bytes.TrimSpace(out)

        // Close callback handed to the connection: run the caller's hook first,
        // then release the shim log resources.
        onCloseWithShimLog := func() {
                onClose()
                cancelShimLog()
                f.Close()
        }
        // Save runtime binary path for restore.
        if err := os.WriteFile(filepath.Join(b.bundle.Path, "shim-binary-path"), []byte(b.runtime), 0600); err != nil {
                return nil, err
        }

        params, err := parseStartResponse(response)
        if err != nil {
                return nil, err
        }

        conn, err := makeConnection(ctx, b.bundle.ID, params, onCloseWithShimLog)
        if err != nil {
                return nil, err
        }

        // Save bootstrap configuration (so containerd can restore shims after restart).
        // NOTE(review): conn is not closed on this error path — confirm whether
        // makeConnection's result needs an explicit close here.
        if err := writeBootstrapParams(filepath.Join(b.bundle.Path, "bootstrap.json"), params); err != nil {
                return nil, fmt.Errorf("failed to write bootstrap.json: %w", err)
        }
        // The address is in the form like ttrpc+unix://<uds-path> or grpc+vsock://<cid>:<port>
        address := fmt.Sprintf("%s+%s", params.Protocol, params.Address)
        return &shim{
                bundle:  b.bundle,
                client:  conn,
                address: address,
                version: params.Version,
        }, nil
}

// Delete runs the shim binary with the "delete" command to clean up after a
// dead shim, decodes the task.DeleteResponse the command wrote to stdout,
// removes the bundle and returns the decoded exit information.
//
// Anything the command writes to stderr is returned as part of the error when
// the command fails, and logged as a warning when it succeeds.
func (b *binary) Delete(ctx context.Context) (*runtime.Exit, error) {
        log.G(ctx).Info("cleaning up dead shim")

        // On Windows and FreeBSD, the current working directory of the shim should
        // not be the bundle path during the delete operation. Instead, we invoke
        // with the default work dir and forward the bundle path on the cmdline.
        // Windows cannot delete the current working directory while an executable
        // is in use with it. On FreeBSD, fork/exec can fail.
        workDir := ""
        switch gruntime.GOOS {
        case "windows", "freebsd":
                // keep the default working directory
        default:
                workDir = b.bundle.Path
        }

        // Command line: -id <bundle id> -bundle <bundle path> [-debug] delete
        args := []string{"-id", b.bundle.ID, "-bundle", b.bundle.Path}
        if level := log.GetLevel(); level == log.DebugLevel || level == log.TraceLevel {
                args = append(args, "-debug")
        }
        args = append(args, "delete")

        cfg := &client.CommandConfig{
                Runtime:      b.runtime,
                Address:      b.containerdAddress,
                TTRPCAddress: b.containerdTTRPCAddress,
                Path:         workDir,
                Args:         args,
        }
        cmd, err := client.Command(ctx, cfg)
        if err != nil {
                return nil, err
        }

        // Capture stdout (serialized response) and stderr (diagnostics) separately.
        var stdout, stderr bytes.Buffer
        cmd.Stdout = &stdout
        cmd.Stderr = &stderr
        if err := cmd.Run(); err != nil {
                log.G(ctx).WithField("cmd", cmd).WithError(err).Error("failed to delete")
                return nil, fmt.Errorf("%s: %w", stderr.String(), err)
        }
        if warnings := stderr.String(); warnings != "" {
                log.G(ctx).Warnf("cleanup warnings %s", warnings)
        }

        var response task.DeleteResponse
        if err := proto.Unmarshal(stdout.Bytes(), &response); err != nil {
                return nil, err
        }
        if err := b.bundle.Delete(); err != nil {
                return nil, err
        }

        exit := &runtime.Exit{
                Status:    response.ExitStatus,
                Timestamp: protobuf.FromTimestamp(response.ExitedAt),
                Pid:       response.Pid,
        }
        return exit, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "context"
        "fmt"

        "github.com/containerd/ttrpc"
        "google.golang.org/grpc"
        "google.golang.org/protobuf/types/known/emptypb"

        v2 "github.com/containerd/containerd/api/runtime/task/v2"
        v3 "github.com/containerd/containerd/api/runtime/task/v3"

        api "github.com/containerd/containerd/api/runtime/task/v3" // Current version used by TaskServiceClient
)

// TaskServiceClient exposes a client interface to shims, which aims to hide
// the underlying complexity and backward compatibility (v2 task service vs v3, TTRPC vs GRPC, etc).
//
// Every method takes a request message from the current task API package
// (imported as api, i.e. task v3) and returns either the matching response
// message or an empty message for calls that carry no reply payload.
// Implementations in this file are ttrpcV2Bridge and grpcV3Bridge.
type TaskServiceClient interface {
        State(context.Context, *api.StateRequest) (*api.StateResponse, error)
        Create(context.Context, *api.CreateTaskRequest) (*api.CreateTaskResponse, error)
        Start(context.Context, *api.StartRequest) (*api.StartResponse, error)
        Delete(context.Context, *api.DeleteRequest) (*api.DeleteResponse, error)
        Pids(context.Context, *api.PidsRequest) (*api.PidsResponse, error)
        Pause(context.Context, *api.PauseRequest) (*emptypb.Empty, error)
        Resume(context.Context, *api.ResumeRequest) (*emptypb.Empty, error)
        Checkpoint(context.Context, *api.CheckpointTaskRequest) (*emptypb.Empty, error)
        Kill(context.Context, *api.KillRequest) (*emptypb.Empty, error)
        Exec(context.Context, *api.ExecProcessRequest) (*emptypb.Empty, error)
        ResizePty(context.Context, *api.ResizePtyRequest) (*emptypb.Empty, error)
        CloseIO(context.Context, *api.CloseIORequest) (*emptypb.Empty, error)
        Update(context.Context, *api.UpdateTaskRequest) (*emptypb.Empty, error)
        Wait(context.Context, *api.WaitRequest) (*api.WaitResponse, error)
        Stats(context.Context, *api.StatsRequest) (*api.StatsResponse, error)
        Connect(context.Context, *api.ConnectRequest) (*api.ConnectResponse, error)
        Shutdown(context.Context, *api.ShutdownRequest) (*emptypb.Empty, error)
}

// NewTaskClient builds a TaskServiceClient for the given transport client and
// task API version.
//
// Accepted values for client:
// - *ttrpc.Client
// - grpc.ClientConnInterface
//
// Accepted combinations:
// - TTRPC with version 2 (compatibility with shims before 2.0)
// - TTRPC with version 3
// - GRPC with version 3
//
// Any other client type or version yields an error. The TTRPC check is made
// before the GRPC one.
func NewTaskClient(client interface{}, version int) (TaskServiceClient, error) {
        if tc, ok := client.(*ttrpc.Client); ok {
                if version == 2 {
                        return &ttrpcV2Bridge{client: v2.NewTaskClient(tc)}, nil
                }
                if version == 3 {
                        return v3.NewTTRPCTaskClient(tc), nil
                }
                return nil, fmt.Errorf("containerd client supports only v2 and v3 TTRPC task client (got %d)", version)
        }

        if gc, ok := client.(grpc.ClientConnInterface); ok {
                if version == 3 {
                        return &grpcV3Bridge{v3.NewTaskClient(gc)}, nil
                }
                return nil, fmt.Errorf("containerd client supports only v3 GRPC task service (got %d)", version)
        }

        return nil, fmt.Errorf("unsupported shim client type %T", client)
}

// ttrpcV2Bridge is a bridge from TTRPC v2 task service.
// Its methods accept v3 (api) requests, re-encode them as v2 requests for the
// wrapped client, and convert v2 replies back to v3 responses.
type ttrpcV2Bridge struct {
        // client is the v2 task service every call is forwarded to.
        client v2.TaskService
}

// Compile-time check that *ttrpcV2Bridge satisfies TaskServiceClient.
var _ TaskServiceClient = (*ttrpcV2Bridge)(nil)

// State asks the v2 shim for the state of the task (or exec process) named in
// request and converts the reply field by field into a v3 StateResponse.
//
// When the call fails, the error is returned with a nil response rather than
// a zero-valued one, so a caller can never mistake fabricated defaults for
// real state.
func (b *ttrpcV2Bridge) State(ctx context.Context, request *api.StateRequest) (*api.StateResponse, error) {
        resp, err := b.client.State(ctx, &v2.StateRequest{
                ID:     request.GetID(),
                ExecID: request.GetExecID(),
        })
        if err != nil {
                return nil, err
        }

        return &api.StateResponse{
                ID:         resp.GetID(),
                Bundle:     resp.GetBundle(),
                Pid:        resp.GetPid(),
                Status:     resp.GetStatus(),
                Stdin:      resp.GetStdin(),
                Stdout:     resp.GetStdout(),
                Stderr:     resp.GetStderr(),
                Terminal:   resp.GetTerminal(),
                ExitStatus: resp.GetExitStatus(),
                ExitedAt:   resp.GetExitedAt(),
                ExecID:     resp.GetExecID(),
        }, nil
}

// Create re-encodes a v3 create request as a v2 request, sends it to the shim
// and returns the pid reported in the reply.
//
// When the call fails, the error is returned with a nil response instead of a
// response carrying a zero pid.
func (b *ttrpcV2Bridge) Create(ctx context.Context, request *api.CreateTaskRequest) (*api.CreateTaskResponse, error) {
        resp, err := b.client.Create(ctx, &v2.CreateTaskRequest{
                ID:               request.GetID(),
                Bundle:           request.GetBundle(),
                Rootfs:           request.GetRootfs(),
                Terminal:         request.GetTerminal(),
                Stdin:            request.GetStdin(),
                Stdout:           request.GetStdout(),
                Stderr:           request.GetStderr(),
                Checkpoint:       request.GetCheckpoint(),
                ParentCheckpoint: request.GetParentCheckpoint(),
                Options:          request.GetOptions(),
        })
        if err != nil {
                return nil, err
        }

        return &api.CreateTaskResponse{Pid: resp.GetPid()}, nil
}

// Start forwards a start request for the task (or exec process) to the v2 shim
// and returns the pid reported in the reply.
//
// When the call fails, the error is returned with a nil response instead of a
// response carrying a zero pid.
func (b *ttrpcV2Bridge) Start(ctx context.Context, request *api.StartRequest) (*api.StartResponse, error) {
        resp, err := b.client.Start(ctx, &v2.StartRequest{
                ID:     request.GetID(),
                ExecID: request.GetExecID(),
        })
        if err != nil {
                return nil, err
        }

        return &api.StartResponse{Pid: resp.GetPid()}, nil
}

// Delete forwards a delete request for the task (or exec process) to the v2
// shim and converts the reply (pid, exit status, exit time) to its v3 form.
//
// When the call fails, the error is returned with a nil response instead of a
// response holding zero-valued exit information.
func (b *ttrpcV2Bridge) Delete(ctx context.Context, request *api.DeleteRequest) (*api.DeleteResponse, error) {
        resp, err := b.client.Delete(ctx, &v2.DeleteRequest{
                ID:     request.GetID(),
                ExecID: request.GetExecID(),
        })
        if err != nil {
                return nil, err
        }

        return &api.DeleteResponse{
                Pid:        resp.GetPid(),
                ExitStatus: resp.GetExitStatus(),
                ExitedAt:   resp.GetExitedAt(),
        }, nil
}

// Pids asks the v2 shim for the processes of the task named in request and
// returns them in a v3 PidsResponse.
//
// When the call fails, the error is returned with a nil response instead of a
// response with an empty process list.
func (b *ttrpcV2Bridge) Pids(ctx context.Context, request *api.PidsRequest) (*api.PidsResponse, error) {
        resp, err := b.client.Pids(ctx, &v2.PidsRequest{ID: request.GetID()})
        if err != nil {
                return nil, err
        }
        return &api.PidsResponse{Processes: resp.GetProcesses()}, nil
}

// Pause re-encodes the v3 pause request as a v2 request and forwards it to the
// shim, returning the shim's reply and error unchanged.
func (b *ttrpcV2Bridge) Pause(ctx context.Context, request *api.PauseRequest) (*emptypb.Empty, error) {
        legacy := &v2.PauseRequest{ID: request.GetID()}
        return b.client.Pause(ctx, legacy)
}

// Resume re-encodes the v3 resume request as a v2 request and forwards it to
// the shim, returning the shim's reply and error unchanged.
func (b *ttrpcV2Bridge) Resume(ctx context.Context, request *api.ResumeRequest) (*emptypb.Empty, error) {
        legacy := &v2.ResumeRequest{ID: request.GetID()}
        return b.client.Resume(ctx, legacy)
}

// Checkpoint copies the id, path and options of the v3 checkpoint request into
// a v2 request and forwards it to the shim, returning the shim's reply and
// error unchanged.
func (b *ttrpcV2Bridge) Checkpoint(ctx context.Context, request *api.CheckpointTaskRequest) (*emptypb.Empty, error) {
        legacy := &v2.CheckpointTaskRequest{
                ID:      request.GetID(),
                Path:    request.GetPath(),
                Options: request.GetOptions(),
        }
        return b.client.Checkpoint(ctx, legacy)
}

// Kill copies the id, exec id, signal and all-flag of the v3 kill request into
// a v2 request and forwards it to the shim, returning the shim's reply and
// error unchanged.
func (b *ttrpcV2Bridge) Kill(ctx context.Context, request *api.KillRequest) (*emptypb.Empty, error) {
        legacy := &v2.KillRequest{
                ID:     request.GetID(),
                ExecID: request.GetExecID(),
                Signal: request.GetSignal(),
                All:    request.GetAll(),
        }
        return b.client.Kill(ctx, legacy)
}

// Exec copies the ids, terminal flag, stdio paths and process spec of the v3
// exec request into a v2 request and forwards it to the shim, returning the
// shim's reply and error unchanged.
func (b *ttrpcV2Bridge) Exec(ctx context.Context, request *api.ExecProcessRequest) (*emptypb.Empty, error) {
        legacy := &v2.ExecProcessRequest{
                ID:       request.GetID(),
                ExecID:   request.GetExecID(),
                Terminal: request.GetTerminal(),
                Stdin:    request.GetStdin(),
                Stdout:   request.GetStdout(),
                Stderr:   request.GetStderr(),
                Spec:     request.GetSpec(),
        }
        return b.client.Exec(ctx, legacy)
}

// ResizePty copies the ids and the requested width and height of the v3 resize
// request into a v2 request and forwards it to the shim, returning the shim's
// reply and error unchanged.
func (b *ttrpcV2Bridge) ResizePty(ctx context.Context, request *api.ResizePtyRequest) (*emptypb.Empty, error) {
        legacy := &v2.ResizePtyRequest{
                ID:     request.GetID(),
                ExecID: request.GetExecID(),
                Width:  request.GetWidth(),
                Height: request.GetHeight(),
        }
        return b.client.ResizePty(ctx, legacy)
}

// CloseIO copies the ids and the stdin flag of the v3 close-IO request into a
// v2 request and forwards it to the shim, returning the shim's reply and error
// unchanged.
func (b *ttrpcV2Bridge) CloseIO(ctx context.Context, request *api.CloseIORequest) (*emptypb.Empty, error) {
        legacy := &v2.CloseIORequest{
                ID:     request.GetID(),
                ExecID: request.GetExecID(),
                Stdin:  request.GetStdin(),
        }
        return b.client.CloseIO(ctx, legacy)
}

// Update copies the id, resources and annotations of the v3 update request
// into a v2 request and forwards it to the shim, returning the shim's reply
// and error unchanged.
func (b *ttrpcV2Bridge) Update(ctx context.Context, request *api.UpdateTaskRequest) (*emptypb.Empty, error) {
        legacy := &v2.UpdateTaskRequest{
                ID:          request.GetID(),
                Resources:   request.GetResources(),
                Annotations: request.GetAnnotations(),
        }
        return b.client.Update(ctx, legacy)
}

// Wait forwards a wait request for the task (or exec process) to the v2 shim
// and converts the reply (exit status and exit time) to its v3 form.
//
// When the call fails, the error is returned with a nil response instead of a
// response that looks like a clean exit with status zero.
func (b *ttrpcV2Bridge) Wait(ctx context.Context, request *api.WaitRequest) (*api.WaitResponse, error) {
        resp, err := b.client.Wait(ctx, &v2.WaitRequest{
                ID:     request.GetID(),
                ExecID: request.GetExecID(),
        })
        if err != nil {
                return nil, err
        }

        return &api.WaitResponse{
                ExitStatus: resp.GetExitStatus(),
                ExitedAt:   resp.GetExitedAt(),
        }, nil
}

// Stats asks the v2 shim for the stats of the task named in request and
// returns them in a v3 StatsResponse.
//
// When the call fails, the error is returned with a nil response instead of a
// response with empty stats.
func (b *ttrpcV2Bridge) Stats(ctx context.Context, request *api.StatsRequest) (*api.StatsResponse, error) {
        resp, err := b.client.Stats(ctx, &v2.StatsRequest{ID: request.GetID()})
        if err != nil {
                return nil, err
        }
        return &api.StatsResponse{Stats: resp.GetStats()}, nil
}

// Connect forwards a connect request for the task to the v2 shim and converts
// the reply (shim pid, task pid, version) to its v3 form.
//
// When the call fails, the error is returned with a nil response instead of a
// response with zero pids and an empty version.
func (b *ttrpcV2Bridge) Connect(ctx context.Context, request *api.ConnectRequest) (*api.ConnectResponse, error) {
        resp, err := b.client.Connect(ctx, &v2.ConnectRequest{ID: request.GetID()})
        if err != nil {
                return nil, err
        }

        return &api.ConnectResponse{
                ShimPid: resp.GetShimPid(),
                TaskPid: resp.GetTaskPid(),
                Version: resp.GetVersion(),
        }, nil
}

// Shutdown copies the id and the now-flag of the v3 shutdown request into a v2
// request and forwards it to the shim, returning the shim's reply and error
// unchanged.
func (b *ttrpcV2Bridge) Shutdown(ctx context.Context, request *api.ShutdownRequest) (*emptypb.Empty, error) {
        legacy := &v2.ShutdownRequest{
                ID:  request.GetID(),
                Now: request.GetNow(),
        }
        return b.client.Shutdown(ctx, legacy)
}

// grpcV3Bridge adapts a v3 GRPC task client to TaskServiceClient.
// GRPC uses same request/response structures as TTRPC, so every method simply
// hands its arguments to the wrapped client and returns what it gets back.
type grpcV3Bridge struct {
        // client is the v3 GRPC task client all calls are delegated to.
        client v3.TaskClient
}

// Compile-time check that *grpcV3Bridge satisfies TaskServiceClient.
var _ TaskServiceClient = (*grpcV3Bridge)(nil)

// State delegates to the GRPC client's State call.
func (g *grpcV3Bridge) State(ctx context.Context, req *api.StateRequest) (*api.StateResponse, error) {
        return g.client.State(ctx, req)
}

// Create delegates to the GRPC client's Create call.
func (g *grpcV3Bridge) Create(ctx context.Context, req *api.CreateTaskRequest) (*api.CreateTaskResponse, error) {
        return g.client.Create(ctx, req)
}

// Start delegates to the GRPC client's Start call.
func (g *grpcV3Bridge) Start(ctx context.Context, req *api.StartRequest) (*api.StartResponse, error) {
        return g.client.Start(ctx, req)
}

// Delete delegates to the GRPC client's Delete call.
func (g *grpcV3Bridge) Delete(ctx context.Context, req *api.DeleteRequest) (*api.DeleteResponse, error) {
        return g.client.Delete(ctx, req)
}

// Pids delegates to the GRPC client's Pids call.
func (g *grpcV3Bridge) Pids(ctx context.Context, req *api.PidsRequest) (*api.PidsResponse, error) {
        return g.client.Pids(ctx, req)
}

// Pause delegates to the GRPC client's Pause call.
func (g *grpcV3Bridge) Pause(ctx context.Context, req *api.PauseRequest) (*emptypb.Empty, error) {
        return g.client.Pause(ctx, req)
}

// Resume delegates to the GRPC client's Resume call.
func (g *grpcV3Bridge) Resume(ctx context.Context, req *api.ResumeRequest) (*emptypb.Empty, error) {
        return g.client.Resume(ctx, req)
}

// Checkpoint delegates to the GRPC client's Checkpoint call.
func (g *grpcV3Bridge) Checkpoint(ctx context.Context, req *api.CheckpointTaskRequest) (*emptypb.Empty, error) {
        return g.client.Checkpoint(ctx, req)
}

// Kill delegates to the GRPC client's Kill call.
func (g *grpcV3Bridge) Kill(ctx context.Context, req *api.KillRequest) (*emptypb.Empty, error) {
        return g.client.Kill(ctx, req)
}

// Exec delegates to the GRPC client's Exec call.
func (g *grpcV3Bridge) Exec(ctx context.Context, req *api.ExecProcessRequest) (*emptypb.Empty, error) {
        return g.client.Exec(ctx, req)
}

// ResizePty delegates to the GRPC client's ResizePty call.
func (g *grpcV3Bridge) ResizePty(ctx context.Context, req *api.ResizePtyRequest) (*emptypb.Empty, error) {
        return g.client.ResizePty(ctx, req)
}

// CloseIO delegates to the GRPC client's CloseIO call.
func (g *grpcV3Bridge) CloseIO(ctx context.Context, req *api.CloseIORequest) (*emptypb.Empty, error) {
        return g.client.CloseIO(ctx, req)
}

// Update delegates to the GRPC client's Update call.
func (g *grpcV3Bridge) Update(ctx context.Context, req *api.UpdateTaskRequest) (*emptypb.Empty, error) {
        return g.client.Update(ctx, req)
}

// Wait delegates to the GRPC client's Wait call.
func (g *grpcV3Bridge) Wait(ctx context.Context, req *api.WaitRequest) (*api.WaitResponse, error) {
        return g.client.Wait(ctx, req)
}

// Stats delegates to the GRPC client's Stats call.
func (g *grpcV3Bridge) Stats(ctx context.Context, req *api.StatsRequest) (*api.StatsResponse, error) {
        return g.client.Stats(ctx, req)
}

// Connect delegates to the GRPC client's Connect call.
func (g *grpcV3Bridge) Connect(ctx context.Context, req *api.ConnectRequest) (*api.ConnectResponse, error) {
        return g.client.Connect(ctx, req)
}

// Shutdown delegates to the GRPC client's Shutdown call.
func (g *grpcV3Bridge) Shutdown(ctx context.Context, req *api.ShutdownRequest) (*emptypb.Empty, error) {
        return g.client.Shutdown(ctx, req)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "context"
        "fmt"
        "os"
        "path/filepath"
        "runtime"

        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/pkg/identifiers"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/typeurl/v2"
        "github.com/opencontainers/runtime-spec/specs-go"
)

// LoadBundle returns a Bundle describing the existing bundle with the given id
// under root, scoped to the namespace carried by ctx.
//
// Only the bundle path (root/<namespace>/<id>) is computed; nothing is read
// from or verified on disk. An error is returned when ctx has no namespace.
func LoadBundle(ctx context.Context, root, id string) (*Bundle, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }

        bundle := new(Bundle)
        bundle.ID = id
        bundle.Namespace = ns
        bundle.Path = filepath.Join(root, ns, id)
        return bundle, nil
}

// NewBundle creates the on-disk layout for a new bundle and returns it.
//
// The bundle directory is state/<namespace>/<id> and the working directory is
// root/<namespace>/<id>, where the namespace comes from ctx. Inside the bundle
// directory it creates a "rootfs" directory, a "work" symlink pointing at the
// working directory and, when spec carries a value, the OCI config file.
//
// id must pass identifiers.Validate and ctx must carry a namespace. If any
// step fails, every directory this call created is removed again.
func NewBundle(ctx context.Context, root, state, id string, spec typeurl.Any) (b *Bundle, err error) {
        if err := identifiers.Validate(id); err != nil {
                return nil, fmt.Errorf("invalid task id %s: %w", id, err)
        }

        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }
        work := filepath.Join(root, ns, id)
        b = &Bundle{
                ID:        id,
                Path:      filepath.Join(state, ns, id),
                Namespace: ns,
        }
        // paths collects what this call created so it can be rolled back; the
        // deferred func reads the named result err to detect failure.
        var paths []string
        defer func() {
                if err != nil {
                        for _, d := range paths {
                                os.RemoveAll(d)
                        }
                }
        }()
        // create state directory for the bundle
        if err := os.MkdirAll(filepath.Dir(b.Path), 0711); err != nil {
                return nil, err
        }
        // Mkdir (not MkdirAll) so that an already existing bundle is an error
        // and is never registered for cleanup.
        if err := os.Mkdir(b.Path, 0700); err != nil {
                return nil, err
        }
        // Register the bundle directory for rollback as soon as it exists, so
        // that a failure while adjusting its permissions does not leave it
        // behind and block a later NewBundle for the same id.
        paths = append(paths, b.Path)
        if typeurl.Is(spec, &specs.Spec{}) {
                if err := prepareBundleDirectoryPermissions(b.Path, spec.GetValue()); err != nil {
                        return nil, err
                }
        }
        // create working directory for the bundle
        if err := os.MkdirAll(filepath.Dir(work), 0711); err != nil {
                return nil, err
        }
        rootfs := filepath.Join(b.Path, "rootfs")
        if err := os.MkdirAll(rootfs, 0711); err != nil {
                return nil, err
        }
        paths = append(paths, rootfs)
        if err := os.Mkdir(work, 0711); err != nil {
                if !os.IsExist(err) {
                        return nil, err
                }
                // A stale working directory exists: replace it with a fresh one.
                os.RemoveAll(work)
                if err := os.Mkdir(work, 0711); err != nil {
                        return nil, err
                }
        }
        paths = append(paths, work)
        // symlink workdir
        if err := os.Symlink(work, filepath.Join(b.Path, "work")); err != nil {
                return nil, err
        }
        if spec := spec.GetValue(); spec != nil {
                // write the spec to the bundle
                specPath := filepath.Join(b.Path, oci.ConfigFilename)
                err = os.WriteFile(specPath, spec, 0666)
                if err != nil {
                        return nil, fmt.Errorf("failed to write bundle spec: %w", err)
                }
        }
        return b, nil
}

// Bundle represents an OCI bundle
// identified by an ID within a namespace and located at Path on disk.
type Bundle struct {
        // ID of the bundle
        ID string
        // Path to the bundle
        // (the directory that holds "rootfs" and the "work" symlink)
        Path string
        // Namespace of the bundle
        Namespace string
}

// Delete removes the bundle from disk.
//
// It resolves the "work" symlink, unmounts the rootfs recursively (skipped on
// darwin), removes the rootfs directory, and then removes the bundle path and
// the working directory via atomicDelete. The working directory is removed
// even if removing the bundle path failed.
//
// When only one of the two removals fails, that error is returned as is. When
// both fail, the bundle error is wrapped and the workdir error is included in
// the message. If the "work" symlink could not be read there is no working
// directory to remove, so the result of removing the bundle path is returned
// directly.
func (b *Bundle) Delete() error {
        // Resolve the link before the bundle directory (which contains it) is removed.
        work, werr := os.Readlink(filepath.Join(b.Path, "work"))
        rootfs := filepath.Join(b.Path, "rootfs")
        if runtime.GOOS != "darwin" {
                if err := mount.UnmountRecursive(rootfs, 0); err != nil {
                        return fmt.Errorf("unmount rootfs %s: %w", rootfs, err)
                }
        }
        // os.Remove (not RemoveAll): only an empty rootfs directory is removed here.
        if err := os.Remove(rootfs); err != nil && !os.IsNotExist(err) {
                return fmt.Errorf("failed to remove bundle rootfs: %w", err)
        }
        err := atomicDelete(b.Path)
        if werr != nil {
                // No workdir removal was attempted, so do not report one as failed.
                return err
        }
        // still attempt removing work dir, even when removing the bundle path failed
        err2 := atomicDelete(work)
        switch {
        case err == nil:
                return err2
        case err2 == nil:
                return err
        }
        return fmt.Errorf("failed to remove both bundle and workdir locations: %v: %w", err2, err)
}

// atomicDelete removes path by first renaming it to a dot-prefixed sibling
// (".<base>" in the same directory) and then deleting that sibling
// recursively. A path that does not exist is not an error.
func atomicDelete(path string) error {
        // create a hidden dir for an atomic removal
        hidden := fmt.Sprintf(".%s", filepath.Base(path))
        atomicPath := filepath.Join(filepath.Dir(path), hidden)

        switch err := os.Rename(path, atomicPath); {
        case err == nil:
                return os.RemoveAll(atomicPath)
        case os.IsNotExist(err):
                return nil
        default:
                return err
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "encoding/json"
        "os"

        "github.com/opencontainers/runtime-spec/specs-go"
)

// prepareBundleDirectoryPermissions adjusts the bundle directory at path so
// that the host GID which the spec maps to container GID 0 can access it.
//
// spec is the JSON-encoded OCI spec. When it contains no GID mapping for
// container GID 0, or that mapping points at host GID 0, nothing is changed.
// Otherwise the directory's group is set to the mapped GID (owner untouched)
// and its mode is set to 0710.
func prepareBundleDirectoryPermissions(path string, spec []byte) error {
        gid, err := remappedGID(spec)
        switch {
        case err != nil:
                return err
        case gid == 0:
                // no remapping in effect
                return nil
        }

        // -1 leaves the owning user as it is.
        if err = os.Chown(path, -1, int(gid)); err != nil {
                return err
        }
        return os.Chmod(path, 0710)
}

// ociSpecUserNS is a subset of specs.Spec used to reduce garbage during
// unmarshal.
// Only the Linux section is decoded; remappedGID is its consumer.
type ociSpecUserNS struct {
        // Linux is nil when the decoded spec has no linux section.
        Linux *linuxSpecUserNS
}

// linuxSpecUserNS is a subset of specs.Linux used to reduce garbage during
// unmarshal.
// Only the GID mappings are decoded.
type linuxSpecUserNS struct {
        // GIDMappings lists the container-to-host GID mappings from the spec.
        GIDMappings []specs.LinuxIDMapping
}

// remappedGID decodes the JSON OCI spec and returns the host GID of the first
// GID mapping whose container ID is 0.
//
// It returns 0 when the spec has no linux section, no GID mappings, or no
// mapping for container GID 0. A spec that is not valid JSON yields an error.
func remappedGID(spec []byte) (uint32, error) {
        var parsed ociSpecUserNS
        if err := json.Unmarshal(spec, &parsed); err != nil {
                return 0, err
        }

        if linux := parsed.Linux; linux != nil {
                // Ranging over an empty or nil slice is a no-op, so the "no
                // mappings" case falls through to the final return.
                for i := range linux.GIDMappings {
                        if m := linux.GIDMappings[i]; m.ContainerID == 0 {
                                return m.HostID, nil
                        }
                }
        }
        return 0, nil
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "github.com/containerd/platforms"
)

// defaultPlatforms returns a one-element list holding the string reported by
// platforms.DefaultString.
func defaultPlatforms() []string {
        current := platforms.DefaultString()
        return []string{current}
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "context"
        "errors"

        task "github.com/containerd/containerd/api/runtime/task/v3"
        tasktypes "github.com/containerd/containerd/api/types/task"
        "github.com/containerd/containerd/v2/core/runtime"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
        "github.com/containerd/ttrpc"
)

// process addresses one process through a shim task: every call it makes is
// sent with the shim's ID as the task ID and id as the ExecID.
type process struct {
        // id is returned by ID and sent as ExecID in every request.
        id   string
        // shim provides the task client and the task ID used for the requests.
        shim *shimTask
}

// ID returns the id of the process.
func (p *process) ID() string {
        return p.id
}

// Kill sends signal to the process through the shim. The trailing bool
// argument is ignored. Errors from the shim are converted with
// errdefs.FromGRPC.
func (p *process) Kill(ctx context.Context, signal uint32, _ bool) error {
        req := &task.KillRequest{
                ID:     p.shim.ID(),
                ExecID: p.id,
                Signal: signal,
        }
        if _, err := p.shim.task.Kill(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// statusFromProto maps a protobuf task status onto the corresponding
// runtime.Status. Values without a mapping yield the zero runtime.Status.
func statusFromProto(from tasktypes.Status) runtime.Status {
        switch from {
        case tasktypes.Status_CREATED:
                return runtime.CreatedStatus
        case tasktypes.Status_RUNNING:
                return runtime.RunningStatus
        case tasktypes.Status_STOPPED:
                return runtime.StoppedStatus
        case tasktypes.Status_PAUSED:
                return runtime.PausedStatus
        case tasktypes.Status_PAUSING:
                return runtime.PausingStatus
        }

        // no mapping: fall back to the zero value
        var unmapped runtime.Status
        return unmapped
}

// State fetches the current state of the process from the shim.
//
// If the shim connection is closed (ttrpc.ErrClosed) the process is reported
// as errdefs.ErrNotFound; any other error is converted with errdefs.FromGRPC.
func (p *process) State(ctx context.Context) (runtime.State, error) {
        req := &task.StateRequest{
                ID:     p.shim.ID(),
                ExecID: p.id,
        }
        response, err := p.shim.task.State(ctx, req)
        switch {
        case err == nil:
                // fall through to the conversion below
        case errors.Is(err, ttrpc.ErrClosed):
                return runtime.State{}, errdefs.ErrNotFound
        default:
                return runtime.State{}, errdefs.FromGRPC(err)
        }

        var state runtime.State
        state.Pid = response.Pid
        state.Status = statusFromProto(response.Status)
        state.Stdin = response.Stdin
        state.Stdout = response.Stdout
        state.Stderr = response.Stderr
        state.Terminal = response.Terminal
        state.ExitStatus = response.ExitStatus
        state.ExitedAt = protobuf.FromTimestamp(response.ExitedAt)
        return state, nil
}

// ResizePty changes the size of the process's PTY to the provided width and
// height. Errors from the shim are converted with errdefs.FromGRPC.
func (p *process) ResizePty(ctx context.Context, size runtime.ConsoleSize) error {
        req := &task.ResizePtyRequest{
                ID:     p.shim.ID(),
                ExecID: p.id,
                Width:  size.Width,
                Height: size.Height,
        }
        if _, err := p.shim.task.ResizePty(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// CloseIO asks the shim to close the stdin of the process.
func (p *process) CloseIO(ctx context.Context) error {
        req := &task.CloseIORequest{
                ID:     p.shim.ID(),
                ExecID: p.id,
                Stdin:  true,
        }
        if _, err := p.shim.task.CloseIO(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Start asks the shim to start the process.
func (p *process) Start(ctx context.Context) error {
        req := &task.StartRequest{
                ID:     p.shim.ID(),
                ExecID: p.id,
        }
        if _, err := p.shim.task.Start(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Wait issues a Wait request for the process to the shim and returns the exit
// status and exit timestamp from the response.
func (p *process) Wait(ctx context.Context) (*runtime.Exit, error) {
        req := &task.WaitRequest{
                ID:     p.shim.ID(),
                ExecID: p.id,
        }
        resp, err := p.shim.task.Wait(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        exit := &runtime.Exit{
                Timestamp: protobuf.FromTimestamp(resp.ExitedAt),
                Status:    resp.ExitStatus,
        }
        return exit, nil
}

// Delete asks the shim to delete the process and returns the exit status,
// exit timestamp and pid reported in the response.
func (p *process) Delete(ctx context.Context) (*runtime.Exit, error) {
        req := &task.DeleteRequest{
                ID:     p.shim.ID(),
                ExecID: p.id,
        }
        resp, err := p.shim.task.Delete(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        exit := &runtime.Exit{
                Status:    resp.ExitStatus,
                Timestamp: protobuf.FromTimestamp(resp.ExitedAt),
                Pid:       resp.Pid,
        }
        return exit, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "context"
        "encoding/json"
        "errors"
        "fmt"
        "io"
        "net"
        "os"
        "path/filepath"
        "strings"
        "time"

        "github.com/containerd/containerd/v2/pkg/atomicfile"
        "github.com/containerd/containerd/v2/pkg/dialer"
        "github.com/containerd/ttrpc"
        "google.golang.org/grpc"
        "google.golang.org/grpc/connectivity"
        "google.golang.org/grpc/credentials/insecure"

        eventstypes "github.com/containerd/containerd/api/events"
        task "github.com/containerd/containerd/api/runtime/task/v3"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/events/exchange"
        "github.com/containerd/containerd/v2/core/runtime"
        "github.com/containerd/containerd/v2/pkg/identifiers"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        client "github.com/containerd/containerd/v2/pkg/shim"
        "github.com/containerd/containerd/v2/pkg/timeout"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
)

// Keys under which the shim timeouts are registered with the timeout package.
// Their default durations are set in init.
const (
        // loadTimeout is not referenced in this part of the file beyond its
        // default registration.
        loadTimeout     = "io.containerd.timeout.shim.load"
        // cleanupTimeout bounds cleanupAfterDeadShim.
        cleanupTimeout  = "io.containerd.timeout.shim.cleanup"
        // shutdownTimeout bounds the Shutdown request issued by waitShutdown.
        shutdownTimeout = "io.containerd.timeout.shim.shutdown"
)

func init() {
        timeout.Set(loadTimeout, 5*time.Second)
        timeout.Set(cleanupTimeout, 5*time.Second)
        timeout.Set(shutdownTimeout, 3*time.Second)
}

// loadShim reconnects to the shim that owns bundle. It reopens the shim log and
// copies it to os.Stderr, restores the bootstrap parameters stored under
// bundle.Path, and connects to the shim using them.
//
// The onClose callback is wrapped so that, besides running the caller's
// callback, it also cancels the log context and closes the log file. If
// loadShim returns an error, the deferred cleanups release whatever was
// acquired up to that point.
func loadShim(ctx context.Context, bundle *Bundle, onClose func()) (_ ShimInstance, retErr error) {
        // shimCtx scopes the shim log; it is cancelled on failure here or, on
        // success, by onCloseWithShimLog.
        shimCtx, cancelShimLog := context.WithCancel(ctx)
        defer func() {
                if retErr != nil {
                        cancelShimLog()
                }
        }()
        // open the log pipe and block until the writer is ready
        // this helps with synchronization of the shim
        f, err := openShimLog(shimCtx, bundle, client.AnonReconnectDialer)
        if err != nil {
                return nil, fmt.Errorf("open shim log pipe when reload: %w", err)
        }
        defer func() {
                if retErr != nil {
                        f.Close()
                }
        }()
        // copy the shim's logs to containerd's output
        go func() {
                defer f.Close()
                _, err := io.Copy(os.Stderr, f)
                // To prevent flood of error messages, the expected error
                // should be reset, like os.ErrClosed or os.ErrNotExist, which
                // depends on platform.
                err = checkCopyShimLogError(ctx, err)
                if err != nil {
                        log.G(ctx).WithError(err).Error("copy shim log after reload")
                }
        }()
        // Tie the lifetime of the log to the connection: when the connection
        // closes, stop the log copy as well.
        onCloseWithShimLog := func() {
                onClose()
                cancelShimLog()
                f.Close()
        }

        params, err := restoreBootstrapParams(bundle.Path)
        if err != nil {
                return nil, fmt.Errorf("failed to read boostrap.json when restoring bundle %q: %w", bundle.ID, err)
        }

        conn, err := makeConnection(ctx, bundle.ID, params, onCloseWithShimLog)
        if err != nil {
                return nil, fmt.Errorf("unable to make connection: %w", err)
        }

        // No return path after this point sets retErr, so this cleanup only
        // matters if a failing step is added below.
        defer func() {
                if retErr != nil {
                        conn.Close()
                }
        }()

        // The address is in the form like ttrpc+unix://<uds-path> or grpc+vsock://<cid>:<port>
        address := fmt.Sprintf("%s+%s", params.Protocol, params.Address)

        shim := &shim{
                bundle:  bundle,
                client:  conn,
                address: address,
                version: params.Version,
        }

        return shim, nil
}

// cleanupAfterDeadShim runs the shim binary's Delete for the shim identified
// by id and, if the task is still registered in rt, publishes TaskExit and
// TaskDelete events on its behalf. The whole operation is bounded by
// cleanupTimeout.
//
// When the binary call yields no response, the events carry pid 0, exit
// status 255 and the current time.
func cleanupAfterDeadShim(ctx context.Context, id string, rt *runtime.NSMap[ShimInstance], events *exchange.Exchange, binaryCall *binary) {
        ctx, cancel := timeout.WithContext(ctx, cleanupTimeout)
        defer cancel()

        log.G(ctx).WithField("id", id).Warn("cleaning up after shim disconnected")
        response, deleteErr := binaryCall.Delete(ctx)
        if deleteErr != nil {
                log.G(ctx).WithError(deleteErr).WithField("id", id).Warn("failed to clean up after shim disconnected")
        }

        // A task that is no longer registered was never started or has already
        // been deleted successfully, so there is nothing to announce.
        if _, getErr := rt.Get(ctx, id); getErr != nil {
                return
        }

        var (
                pid        uint32
                exitStatus uint32 = 255
                exitedAt   time.Time
        )
        if response == nil {
                exitedAt = time.Now()
        } else {
                pid = response.Pid
                exitStatus = response.Status
                exitedAt = response.Timestamp
        }

        exitEvent := &eventstypes.TaskExit{
                ContainerID: id,
                ID:          id,
                Pid:         pid,
                ExitStatus:  exitStatus,
                ExitedAt:    protobuf.ToTimestamp(exitedAt),
        }
        events.Publish(ctx, runtime.TaskExitEventTopic, exitEvent)

        deleteEvent := &eventstypes.TaskDelete{
                ContainerID: id,
                Pid:         pid,
                ExitStatus:  exitStatus,
                ExitedAt:    protobuf.ToTimestamp(exitedAt),
        }
        events.Publish(ctx, runtime.TaskDeleteEventTopic, deleteEvent)
}

// CurrentShimVersion is the latest shim version supported by containerd (e.g. TaskService v3).
// parseStartResponse rejects bootstrap parameters that declare a higher version.
const CurrentShimVersion = 3

// ShimInstance represents a running shim process managed by ShimManager.
type ShimInstance interface {
        // Close closes the client connection to the shim.
        io.Closer

        // ID of the shim.
        ID() string
        // Namespace of this shim.
        Namespace() string
        // Bundle is a file system path to the shim's bundle.
        Bundle() string
        // Client returns the underlying TTRPC or GRPC client object for this shim.
        // The underlying object can be either *ttrpc.Client or grpc.ClientConnInterface.
        Client() any
        // Delete will close the client and remove the bundle from disk.
        Delete(ctx context.Context) error
        // Endpoint returns the shim's endpoint information,
        // including address and version.
        Endpoint() (string, int)
}

// parseStartResponse decodes the bootstrap parameters from response.
//
// If response is not valid JSON, or decodes to a version below 2, the whole
// response is taken as the address of a legacy shim speaking TTRPC at
// version 2. A version above CurrentShimVersion is rejected with an error
// wrapping errdefs.ErrNotImplemented.
func parseStartResponse(response []byte) (client.BootstrapParams, error) {
        var params client.BootstrapParams

        decodeErr := json.Unmarshal(response, &params)
        if legacy := decodeErr != nil || params.Version < 2; legacy {
                // Use TTRPC for legacy shims
                params.Address = string(response)
                params.Protocol = "ttrpc"
                params.Version = 2
        }

        if params.Version <= CurrentShimVersion {
                return params, nil
        }
        return client.BootstrapParams{}, fmt.Errorf("unsupported shim version (%d): %w", params.Version, errdefs.ErrNotImplemented)
}

// writeBootstrapParams stores the shim's bootstrap configuration (e.g. how to
// connect, version, etc) as JSON at path, using an atomicfile writer that is
// cancelled if the write fails.
func writeBootstrapParams(path string, params client.BootstrapParams) error {
        absPath, err := filepath.Abs(path)
        if err != nil {
                return err
        }

        encoded, err := json.Marshal(&params)
        if err != nil {
                return err
        }

        out, err := atomicfile.New(absPath, 0o666)
        if err != nil {
                return err
        }

        if _, err := out.Write(encoded); err != nil {
                out.Cancel()
                return err
        }
        return out.Close()
}

// readBootstrapParams reads the file at path and parses its content with
// parseStartResponse.
func readBootstrapParams(path string) (client.BootstrapParams, error) {
        var none client.BootstrapParams

        absPath, err := filepath.Abs(path)
        if err != nil {
                return none, err
        }

        raw, err := os.ReadFile(absPath)
        if err != nil {
                return none, err
        }

        return parseStartResponse(raw)
}

// makeConnection creates a new TTRPC or GRPC connection object for the shim
// identified by id, as described by params. The protocol name is matched
// case-insensitively; anything other than "ttrpc" or "grpc" is an error.
// onClose is installed as the connection's close callback.
func makeConnection(ctx context.Context, id string, params client.BootstrapParams, onClose func()) (_ io.Closer, retErr error) {
        fields := log.Fields{
                "address":  params.Address,
                "protocol": params.Protocol,
                "version":  params.Version,
        }
        log.G(ctx).WithFields(fields).Infof("connecting to shim %s", id)

        protocol := strings.ToLower(params.Protocol)
        switch protocol {
        case "ttrpc":
                conn, err := client.Connect(params.Address, client.AnonReconnectDialer)
                if err != nil {
                        return nil, fmt.Errorf("failed to create TTRPC connection: %w", err)
                }
                defer func() {
                        if retErr != nil {
                                conn.Close()
                        }
                }()

                ttrpcClient := ttrpc.NewClient(conn, ttrpc.WithOnClose(onClose))
                return ttrpcClient, nil
        case "grpc":
                dialCtx, cancel := context.WithTimeout(ctx, time.Second*100)
                defer cancel()

                // Block until the connection is established (or dialCtx expires).
                dialOpts := []grpc.DialOption{
                        grpc.WithTransportCredentials(insecure.NewCredentials()),
                        grpc.WithBlock(),
                }
                return grpcDialContext(dialCtx, params.Address, onClose, dialOpts...)
        default:
                return nil, fmt.Errorf("unexpected protocol: %q", params.Protocol)
        }
}

// grpcDialContext and the underlying grpcConn type exist solely
// so we can have something similar to ttrpc.WithOnClose to have
// a callback run when the connection is severed or explicitly closed.
//
// It first probes address with a plain unix dial, then dials it with gRPC using
// gopts, and finally starts a goroutine that watches the connection state and
// runs onClose once the connection becomes Idle or Shutdown.
func grpcDialContext(
        ctx context.Context,
        address string,
        onClose func(),
        gopts ...grpc.DialOption,
) (*grpcConn, error) {
        // If grpc.WithBlock is specified in gopts this causes the connection to block waiting for
        // a connection regardless of if the socket exists or has a listener when Dial begins. This
        // specific behavior of WithBlock is mostly undesirable for shims, as if the socket isn't
        // there when we go to load/connect there's likely an issue. However, getting rid of WithBlock is
        // also undesirable as we don't want the background connection behavior, we want to ensure
        // a connection before moving on. To bring this in line with the ttrpc connection behavior
        // let's do an initial dial to ensure the shim's socket is actually available. stat wouldn't suffice
        // here as if the shim exited unexpectedly its socket may still be on the filesystem, but it'd return
        // ECONNREFUSED which grpc.DialContext will happily trudge along through for the full timeout.
        //
        // This is especially helpful on restart of containerd as if the shim died while containerd
        // was down, we end up waiting the full timeout.
        conn, err := net.DialTimeout("unix", address, time.Second*10)
        if err != nil {
                return nil, err
        }
        // The probe connection is only used to check availability.
        conn.Close()

        target := dialer.DialAddress(address)
        client, err := grpc.DialContext(ctx, target, gopts...)
        if err != nil {
                return nil, fmt.Errorf("failed to create GRPC connection: %w", err)
        }

        // done is closed after onClose has returned; see grpcConn.UserOnCloseWait.
        done := make(chan struct{})
        go func() {
                // The watcher uses its own background context rather than ctx.
                gctx := context.Background()
                sourceState := connectivity.Ready
                for {
                        if client.WaitForStateChange(gctx, sourceState) {
                                state := client.GetState()
                                if state == connectivity.Idle || state == connectivity.Shutdown {
                                        // Leaves the for loop: the connection is treated as closed.
                                        break
                                }
                                // Could be transient failure. Let's see if we can get back to a working
                                // state.
                                log.G(gctx).WithFields(log.Fields{
                                        "state": state,
                                        "addr":  target,
                                }).Warn("shim grpc connection unexpected state")
                                sourceState = state
                        }
                }
                onClose()
                close(done)
        }()

        return &grpcConn{
                ClientConn:  client,
                onCloseDone: done,
        }, nil
}

// grpcConn pairs a *grpc.ClientConn with a channel that is closed once the
// onClose callback passed to grpcDialContext has run.
type grpcConn struct {
        *grpc.ClientConn
        // onCloseDone is closed after the onClose callback returns.
        onCloseDone chan struct{}
}

// UserOnCloseWait blocks until the connection's onClose callback has finished
// (onCloseDone is closed), returning nil, or until ctx is done, returning
// ctx.Err().
func (gc *grpcConn) UserOnCloseWait(ctx context.Context) error {
        select {
        case <-gc.onCloseDone:
                return nil
        case <-ctx.Done():
                return ctx.Err()
        }
}

// shim is the ShimInstance implementation built by loadShim.
type shim struct {
        // bundle supplies the shim's ID, namespace and path; Delete removes it.
        bundle  *Bundle
        // client is the connection to the shim; Close and Delete handle a
        // *ttrpc.Client or a *grpcConn.
        client  any
        // address and version are what Endpoint reports.
        address string
        version int
}

// Compile-time check that *shim implements ShimInstance.
var _ ShimInstance = (*shim)(nil)

// ID returns the ID of the shim/task, taken from its bundle.
func (s *shim) ID() string {
        return s.bundle.ID
}

// Endpoint returns the shim's address and version.
func (s *shim) Endpoint() (string, int) {
        return s.address, s.version
}

// Namespace returns the namespace recorded in the shim's bundle.
func (s *shim) Namespace() string {
        return s.bundle.Namespace
}

// Bundle returns the path of the shim's bundle.
func (s *shim) Bundle() string {
        return s.bundle.Path
}

// Client returns the underlying client connection object as stored in the shim.
func (s *shim) Client() any {
        return s.client
}

// Close closes the underlying client connection. A client that is neither a
// *ttrpc.Client nor a *grpcConn is left untouched and nil is returned.
func (s *shim) Close() error {
        switch c := s.client.(type) {
        case *ttrpc.Client:
                return c.Close()
        case *grpcConn:
                return c.Close()
        default:
                return nil
        }
}

// Delete closes the client connection, waits for its on-close callback to
// finish (or for ctx to end), and then deletes the bundle. Every failure along
// the way is collected and returned as a single joined error; later steps are
// still attempted after an earlier one fails.
func (s *shim) Delete(ctx context.Context) error {
        var errs []error

        switch c := s.client.(type) {
        case *ttrpc.Client:
                if err := c.Close(); err != nil {
                        errs = append(errs, fmt.Errorf("failed to close ttrpc client: %w", err))
                }
                if err := c.UserOnCloseWait(ctx); err != nil {
                        errs = append(errs, fmt.Errorf("close wait error: %w", err))
                }
        case *grpcConn:
                if err := c.Close(); err != nil {
                        errs = append(errs, fmt.Errorf("failed to close grpc client: %w", err))
                }
                if err := c.UserOnCloseWait(ctx); err != nil {
                        errs = append(errs, fmt.Errorf("close wait error: %w", err))
                }
        }

        if err := s.bundle.Delete(); err != nil {
                log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to delete bundle")
                errs = append(errs, fmt.Errorf("failed to delete bundle: %w", err))
        }

        return errors.Join(errs...)
}

// Compile-time check that *shimTask implements runtime.Task.
var _ runtime.Task = &shimTask{}

// shimTask wraps shim process and adds task service client for compatibility with existing shim manager.
type shimTask struct {
        ShimInstance
        // task is the task service client used for all task and exec requests.
        task TaskServiceClient
}

// newShimTask wraps shim in a shimTask, creating a task service client from
// the shim's client object and endpoint version.
func newShimTask(shim ShimInstance) (*shimTask, error) {
        _, version := shim.Endpoint()

        taskClient, err := NewTaskClient(shim.Client(), version)
        if err != nil {
                return nil, err
        }

        wrapped := &shimTask{
                ShimInstance: shim,
                task:         taskClient,
        }
        return wrapped, nil
}

// Shutdown sends a Shutdown request to the shim. A closed ttrpc connection is
// treated as success; any other error is converted with errdefs.FromGRPC.
func (s *shimTask) Shutdown(ctx context.Context) error {
        req := &task.ShutdownRequest{
                ID: s.ID(),
        }
        _, err := s.task.Shutdown(ctx, req)
        if err == nil || errors.Is(err, ttrpc.ErrClosed) {
                return nil
        }
        return errdefs.FromGRPC(err)
}

// waitShutdown calls Shutdown with a context bounded by shutdownTimeout.
func (s *shimTask) waitShutdown(ctx context.Context) error {
        shutdownCtx, cancel := timeout.WithContext(ctx, shutdownTimeout)
        defer cancel()

        return s.Shutdown(shutdownCtx)
}

// PID returns the task pid reported by the shim in response to a Connect
// request.
func (s *shimTask) PID(ctx context.Context) (uint32, error) {
        req := &task.ConnectRequest{
                ID: s.ID(),
        }
        resp, err := s.task.Connect(ctx, req)
        if err != nil {
                return 0, errdefs.FromGRPC(err)
        }
        return resp.TaskPid, nil
}

// delete removes the task from the shim and tears the shim instance down.
//
// It first sends a Delete request to the task service. An error other than a
// closed ttrpc connection or a not-found aborts immediately. Otherwise it
// continues: unless sandboxed is true the shim is asked to shut down, then the
// shim instance is deleted and removeTask is called for this task's ID.
//
// On success the exit information from the Delete response is returned. If the
// Delete request failed with a tolerated error, cleanup still runs and that
// error is returned afterwards.
func (s *shimTask) delete(ctx context.Context, sandboxed bool, removeTask func(ctx context.Context, id string)) (*runtime.Exit, error) {
        response, shimErr := s.task.Delete(ctx, &task.DeleteRequest{
                ID: s.ID(),
        })
        if shimErr != nil {
                log.G(ctx).WithField("id", s.ID()).WithError(shimErr).Debug("failed to delete task")
                // A closed connection is kept as-is; other errors are converted
                // and only a not-found is tolerated.
                if !errors.Is(shimErr, ttrpc.ErrClosed) {
                        shimErr = errdefs.FromGRPC(shimErr)
                        if !errdefs.IsNotFound(shimErr) {
                                return nil, shimErr
                        }
                }
        }

        // NOTE: If the shim has been killed and ttrpc connection has been
        // closed, the shimErr will not be nil. For this case, the event
        // subscriber, like moby/moby, might have received the exit or delete
        // events. Just in case, we should allow ttrpc-callback-on-close to
        // send the exit and delete events again. And the exit status will
        // depend on result of shimV2.Delete.
        //
        // If not, the shim has delivered the exit and delete events.
        // So we should remove the record and prevent duplicate events from
        // ttrpc-callback-on-close.
        //
        // TODO: It's hard to guarantee that the event is unique and sent only
        // once. The moby/moby should not rely on the assumption that there is
        // only one exit event. The moby/moby should handle the duplicate events.
        //
        // REF: https://github.com/containerd/containerd/issues/4769
        if shimErr == nil {
                removeTask(ctx, s.ID())
        }

        // Don't shutdown sandbox as there may be other containers running.
        // Let controller decide when to shutdown.
        if !sandboxed {
                if err := s.waitShutdown(ctx); err != nil {
                        // FIXME(fuweid):
                        //
                        // If the error is context canceled, should we use context.TODO()
                        // to wait for it?
                        log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim task and the shim might be leaked")
                }
        }

        // A failure to delete the shim instance is logged but does not change
        // the result of delete.
        if err := s.ShimInstance.Delete(ctx); err != nil {
                log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to delete shim")
        }

        // remove self from the runtime task list
        // this seems dirty but it cleans up the API across runtimes, tasks, and the service
        removeTask(ctx, s.ID())

        if shimErr != nil {
                return nil, shimErr
        }

        return &runtime.Exit{
                Status:    response.ExitStatus,
                Timestamp: protobuf.FromTimestamp(response.ExitedAt),
                Pid:       response.Pid,
        }, nil
}

// Create sends a CreateTask request to the shim, built from opts and the
// shim's ID and bundle path, and returns s as the created task.
func (s *shimTask) Create(ctx context.Context, opts runtime.CreateOpts) (runtime.Task, error) {
        // Task options take precedence; fall back to the runtime options when
        // they are absent or carry no value.
        taskOpts := opts.TaskOptions
        if taskOpts == nil || taskOpts.GetValue() == nil {
                taskOpts = opts.RuntimeOptions
        }

        req := &task.CreateTaskRequest{
                ID:         s.ID(),
                Bundle:     s.Bundle(),
                Stdin:      opts.IO.Stdin,
                Stdout:     opts.IO.Stdout,
                Stderr:     opts.IO.Stderr,
                Terminal:   opts.IO.Terminal,
                Checkpoint: opts.Checkpoint,
                Options:    protobuf.FromAny(taskOpts),
        }
        for i := range opts.Rootfs {
                m := opts.Rootfs[i]
                req.Rootfs = append(req.Rootfs, &types.Mount{
                        Type:    m.Type,
                        Source:  m.Source,
                        Target:  m.Target,
                        Options: m.Options,
                })
        }

        if _, err := s.task.Create(ctx, req); err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        return s, nil
}

// Pause sends a Pause request for the task to the shim.
func (s *shimTask) Pause(ctx context.Context) error {
        req := &task.PauseRequest{
                ID: s.ID(),
        }
        _, err := s.task.Pause(ctx, req)
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Resume sends a Resume request for the task to the shim.
func (s *shimTask) Resume(ctx context.Context) error {
        req := &task.ResumeRequest{
                ID: s.ID(),
        }
        _, err := s.task.Resume(ctx, req)
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Start sends a Start request for the task to the shim.
func (s *shimTask) Start(ctx context.Context) error {
        req := &task.StartRequest{
                ID: s.ID(),
        }
        if _, err := s.task.Start(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Kill sends a Kill request carrying signal and the all flag for the task to
// the shim.
func (s *shimTask) Kill(ctx context.Context, signal uint32, all bool) error {
        req := &task.KillRequest{
                ID:     s.ID(),
                Signal: signal,
                All:    all,
        }
        _, err := s.task.Kill(ctx, req)
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Exec validates id, asks the shim to add an exec process with that id to the
// task, and returns a handle for it. An invalid id is rejected before any
// request is sent.
func (s *shimTask) Exec(ctx context.Context, id string, opts runtime.ExecOpts) (runtime.ExecProcess, error) {
        if err := identifiers.Validate(id); err != nil {
                return nil, fmt.Errorf("invalid exec id %s: %w", id, err)
        }

        req := &task.ExecProcessRequest{
                ID:       s.ID(),
                ExecID:   id,
                Stdin:    opts.IO.Stdin,
                Stdout:   opts.IO.Stdout,
                Stderr:   opts.IO.Stderr,
                Terminal: opts.IO.Terminal,
                Spec:     opts.Spec,
        }
        _, err := s.task.Exec(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        execProcess := &process{
                id:   id,
                shim: s,
        }
        return execProcess, nil
}

// Pids returns the process information the shim reports for the task. The
// result is nil when the shim reports no processes.
func (s *shimTask) Pids(ctx context.Context) ([]runtime.ProcessInfo, error) {
        req := &task.PidsRequest{
                ID: s.ID(),
        }
        resp, err := s.task.Pids(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        var infos []runtime.ProcessInfo
        for i := range resp.Processes {
                proc := resp.Processes[i]
                infos = append(infos, runtime.ProcessInfo{
                        Pid:  proc.Pid,
                        Info: proc.Info,
                })
        }
        return infos, nil
}

// ResizePty asks the shim to resize the task's PTY to the given width and
// height.
func (s *shimTask) ResizePty(ctx context.Context, size runtime.ConsoleSize) error {
        req := &task.ResizePtyRequest{
                ID:     s.ID(),
                Width:  size.Width,
                Height: size.Height,
        }
        if _, err := s.task.ResizePty(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// CloseIO asks the shim to close the stdin of the task.
func (s *shimTask) CloseIO(ctx context.Context) error {
        req := &task.CloseIORequest{
                ID:    s.ID(),
                Stdin: true,
        }
        if _, err := s.task.CloseIO(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Wait looks up the task pid and then issues a Wait request to the shim. The
// returned exit carries that pid together with the exit status and timestamp
// from the response.
func (s *shimTask) Wait(ctx context.Context) (*runtime.Exit, error) {
        // The pid is fetched before the Wait request is sent.
        pid, err := s.PID(ctx)
        if err != nil {
                return nil, err
        }

        req := &task.WaitRequest{
                ID: s.ID(),
        }
        resp, err := s.task.Wait(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        exit := &runtime.Exit{
                Pid:       pid,
                Timestamp: protobuf.FromTimestamp(resp.ExitedAt),
                Status:    resp.ExitStatus,
        }
        return exit, nil
}

// Checkpoint sends a Checkpoint request for the task to the shim, passing
// path and options through unchanged.
func (s *shimTask) Checkpoint(ctx context.Context, path string, options *ptypes.Any) error {
        _, err := s.task.Checkpoint(ctx, &task.CheckpointTaskRequest{
                ID:      s.ID(),
                Path:    path,
                Options: options,
        })
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Update sends an Update request for the task to the shim, passing resources
// and annotations through unchanged.
func (s *shimTask) Update(ctx context.Context, resources *ptypes.Any, annotations map[string]string) error {
        req := &task.UpdateTaskRequest{
                ID:          s.ID(),
                Resources:   resources,
                Annotations: annotations,
        }
        _, err := s.task.Update(ctx, req)
        if err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Stats returns the Stats field of the shim's response to a Stats request for
// the task.
func (s *shimTask) Stats(ctx context.Context) (*ptypes.Any, error) {
        req := &task.StatsRequest{
                ID: s.ID(),
        }
        resp, err := s.task.Stats(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        return resp.Stats, nil
}

// Process returns a handle for the exec process with the given id. The handle
// is only returned if a State query for it succeeds; otherwise that query's
// error is returned.
func (s *shimTask) Process(ctx context.Context, id string) (runtime.ExecProcess, error) {
        candidate := &process{
                id:   id,
                shim: s,
        }
        _, err := candidate.State(ctx)
        if err != nil {
                return nil, err
        }
        return candidate, nil
}

// State queries the shim for the current state of the task. A closed ttrpc
// connection is reported as errdefs.ErrNotFound; every other error is
// converted with errdefs.FromGRPC.
func (s *shimTask) State(ctx context.Context) (runtime.State, error) {
        req := &task.StateRequest{
                ID: s.ID(),
        }
        resp, err := s.task.State(ctx, req)
        switch {
        case err == nil:
                // fall through to building the state below
        case errors.Is(err, ttrpc.ErrClosed):
                return runtime.State{}, errdefs.ErrNotFound
        default:
                return runtime.State{}, errdefs.FromGRPC(err)
        }

        state := runtime.State{
                Pid:        resp.Pid,
                Status:     statusFromProto(resp.Status),
                Stdin:      resp.Stdin,
                Stdout:     resp.Stdout,
                Stderr:     resp.Stderr,
                Terminal:   resp.Terminal,
                ExitStatus: resp.ExitStatus,
                ExitedAt:   protobuf.FromTimestamp(resp.ExitedAt),
        }
        return state, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "context"
        "errors"
        "fmt"
        "os"
        "path/filepath"

        "github.com/containerd/errdefs"
        "github.com/containerd/log"

        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/internal/cleanup"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/timeout"
)

// LoadExistingShims loads existing shims from the path specified by stateDir;
// rootDir is used for cleaning up the unused paths of removed shims.
//
// Every non-hidden directory directly under stateDir is treated as a
// namespace. A failure within one namespace is logged and does not stop the
// remaining namespaces from being processed; only a failure to read stateDir
// itself is returned.
func (m *ShimManager) LoadExistingShims(ctx context.Context, stateDir string, rootDir string) error {
        entries, err := os.ReadDir(stateDir)
        if err != nil {
                return err
        }

        for _, entry := range entries {
                if !entry.IsDir() {
                        continue
                }
                ns := entry.Name()
                // skip hidden directories
                if hidden := len(ns) > 0 && ns[0] == '.'; hidden {
                        continue
                }

                log.G(ctx).WithField("namespace", ns).Debug("loading tasks in namespace")
                if loadErr := m.loadShims(namespaces.WithNamespace(ctx, ns), stateDir); loadErr != nil {
                        log.G(ctx).WithField("namespace", ns).WithError(loadErr).Error("loading tasks in namespace")
                        // Work directories are only cleaned up once the shims loaded.
                        continue
                }
                if cleanErr := m.cleanupWorkDirs(namespaces.WithNamespace(ctx, ns), rootDir); cleanErr != nil {
                        log.G(ctx).WithField("namespace", ns).WithError(cleanErr).Error("cleanup working directory in namespace")
                }
        }
        return nil
}

// loadShims restores every shim bundle found in stateDir for the namespace
// carried by ctx. Bundles that cannot be read, are empty, or whose shim fails
// to load are deleted and skipped; only namespace, directory-listing and
// LoadBundle errors are returned.
func (m *ShimManager) loadShims(ctx context.Context, stateDir string) error {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }
        ctx = log.WithLogger(ctx, log.G(ctx).WithField("namespace", ns))

        entries, err := os.ReadDir(filepath.Join(stateDir, ns))
        if err != nil {
                return err
        }

        // listBundle returns the names of all entries inside a bundle directory.
        listBundle := func(path string) ([]string, error) {
                dir, openErr := os.Open(path)
                if openErr != nil {
                        return nil, openErr
                }
                names, readErr := dir.Readdirnames(-1)
                dir.Close()
                return names, readErr
        }

        for _, entry := range entries {
                id := entry.Name()
                // Shim bundles are plain, non-hidden directories.
                if !entry.IsDir() || (len(id) > 0 && id[0] == '.') {
                        continue
                }

                bundle, err := LoadBundle(ctx, stateDir, id)
                if err != nil {
                        // fine to return error here, it is a programmer error if the context
                        // does not have a namespace
                        return err
                }

                // fast path: unreadable or empty bundles are removed without
                // trying to reach a shim.
                contents, listErr := listBundle(bundle.Path)
                switch {
                case listErr != nil:
                        bundle.Delete()
                        log.G(ctx).WithError(listErr).Errorf("fast path read bundle path for %s", bundle.Path)
                case len(contents) == 0:
                        bundle.Delete()
                default:
                        if loadErr := m.loadShim(ctx, bundle); loadErr != nil {
                                log.G(ctx).WithError(loadErr).Errorf("failed to load shim %s", bundle.Path)
                                bundle.Delete()
                        }
                }
        }
        return nil
}

// loadShim reconnects to the shim that owns bundle and registers it with the
// manager, or cleans it up when it turns out to be a leaked shim.
//
// The runtime is taken from the bundle's "shim-binary-path" file when that
// file is readable, and from the container record in the metadata store
// otherwise.
func (m *ShimManager) loadShim(ctx context.Context, bundle *Bundle) error {
        id := bundle.ID

        // If we're on 1.6+ and specified custom path to the runtime binary, path will be saved in 'shim-binary-path' file.
        var runtimeName string
        data, readErr := os.ReadFile(filepath.Join(bundle.Path, "shim-binary-path"))
        switch {
        case readErr == nil:
                runtimeName = string(data)
        case !os.IsNotExist(readErr):
                log.G(ctx).WithError(readErr).Error("failed to read `runtime` path from bundle")
        }

        // Query runtime name from metadata store
        if runtimeName == "" {
                container, getErr := m.containers.Get(ctx, id)
                if getErr != nil {
                        log.G(ctx).WithError(getErr).Errorf("loading container %s", id)
                        rootfs := filepath.Join(bundle.Path, "rootfs")
                        if umountErr := mount.UnmountRecursive(rootfs, 0); umountErr != nil {
                                log.G(ctx).WithError(umountErr).Errorf("failed to unmount of rootfs %s", id)
                        }
                        return getErr
                }
                runtimeName = container.Runtime.Name
        }

        runtimePath, err := m.resolveRuntimePath(runtimeName)
        if err != nil {
                bundle.Delete()

                return fmt.Errorf("failed to resolve runtime path: %w", err)
        }

        binaryCall := shimBinary(bundle,
                shimBinaryConfig{
                        runtime:      runtimePath,
                        address:      m.containerdAddress,
                        ttrpcAddress: m.containerdTTRPCAddress,
                        schedCore:    m.schedCore,
                })

        // onDisconnect runs when the connection to the shim is closed.
        onDisconnect := func() {
                log.G(ctx).WithField("id", id).Info("shim disconnected")

                cleanupAfterDeadShim(cleanup.Background(ctx), id, m.shims, m.events, binaryCall)
                // Remove self from the runtime task list.
                m.shims.Delete(ctx, id)
        }

        // TODO: It seems we can only call loadShim here if it is a sandbox shim?
        shim, err := loadShimTask(ctx, bundle, onDisconnect)
        if err != nil {
                cleanupAfterDeadShim(ctx, id, m.shims, m.events, binaryCall)
                return fmt.Errorf("unable to load shim %q: %w", id, err)
        }

        // There are 3 possibilities for the loaded shim here:
        // 1. It could be a shim that is running a task.
        // 2. It could be a sandbox shim.
        // 3. Or it could be a shim that was created for running a task but
        // something happened (probably a containerd crash) and the task was never
        // created. This shim process should be cleaned up here. Look at
        // containerd/containerd#6860 for further details.

        _, sgetErr := m.sandboxStore.Get(ctx, id)
        pInfo, pidErr := shim.Pids(ctx)

        notSandbox := errors.Is(sgetErr, errdefs.ErrNotFound)
        noProcesses := len(pInfo) == 0 || errors.Is(pidErr, errdefs.ErrNotFound)
        if !notSandbox || !noProcesses {
                m.shims.Add(ctx, shim.ShimInstance)
                return nil
        }

        log.G(ctx).WithField("id", id).Info("cleaning leaked shim process")
        // We are unable to get Pids from the shim and it's not a sandbox
        // shim. We should clean it up here.
        // No need to do anything for removeTask since we never added this shim.
        shim.delete(ctx, false, func(ctx context.Context, id string) {})
        return nil
}

// loadShimTask loads the shim for bundle, wraps it in a shimTask and checks
// that the shim answers a PID request within loadTimeout. onClose is passed
// through to loadShim.
func loadShimTask(ctx context.Context, bundle *Bundle, onClose func()) (_ *shimTask, retErr error) {
        instance, err := loadShim(ctx, bundle, onClose)
        if err != nil {
                return nil, err
        }

        // Check connectivity, TaskService is the only required service, so create a temp one to check connection.
        task, err := newShimTask(instance)
        if err != nil {
                return nil, err
        }

        probeCtx, cancel := timeout.WithContext(ctx, loadTimeout)
        defer cancel()

        _, err = task.PID(probeCtx)
        if err != nil {
                return nil, err
        }
        return task, nil
}

// cleanupWorkDirs removes every entry under rootDir/<namespace> for which the
// manager holds no loaded shim. The namespace is taken from ctx. Removal
// failures are logged and do not abort the sweep.
func (m *ShimManager) cleanupWorkDirs(ctx context.Context, rootDir string) error {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        nsDir, err := os.Open(filepath.Join(rootDir, ns))
        if err != nil {
                return err
        }
        defer nsDir.Close()

        names, err := nsDir.Readdirnames(-1)
        if err != nil {
                return err
        }

        for _, name := range names {
                if _, getErr := m.shims.Get(ctx, name); getErr == nil {
                        // A shim is loaded for this entry; keep its working directory.
                        continue
                }
                // if the task was not loaded, cleanup and empty working directory
                // this can happen on a reboot where /run for the bundle state is cleaned up
                // but that persistent working dir is left
                stale := filepath.Join(rootDir, ns, name)
                if rmErr := os.RemoveAll(stale); rmErr != nil {
                        log.G(ctx).WithError(rmErr).Errorf("cleanup working dir %s", stale)
                }
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "context"
        "errors"
        "fmt"
        "os"
        "os/exec"
        "path/filepath"
        "strings"
        "sync"

        "github.com/containerd/log"
        "github.com/containerd/platforms"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/events/exchange"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/core/runtime"
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/internal/cleanup"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        shimbinary "github.com/containerd/containerd/v2/pkg/shim"
        "github.com/containerd/containerd/v2/pkg/timeout"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/version"
)

// Config is the TOML configuration of the shim plugin.
type Config struct {
        // Platforms lists the supported platforms. The plugin's InitFn parses
        // these strings with platforms.ParseAll.
        Platforms []string `toml:"platforms"`
        // SchedCore enables Linux core scheduling. The value is handed to
        // NewShimManager via ManagerConfig.SchedCore.
        SchedCore bool `toml:"sched_core"`
}

func init() {
        // ShimManager is not only for TaskManager,
        // the "shim" sandbox controller also use it to manage shims,
        // so we make it an independent plugin
        registry.Register(&plugin.Registration{
                Type: plugins.ShimPlugin,
                ID:   "shim",
                Requires: []plugin.Type{
                        plugins.EventPlugin,
                        plugins.MetadataPlugin,
                },
                Config: &Config{
                        Platforms: defaultPlatforms(),
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        config := ic.Config.(*Config)
                        supportedPlatforms, err := platforms.ParseAll(config.Platforms)
                        if err != nil {
                                return nil, err
                        }
                        ic.Meta.Platforms = supportedPlatforms

                        m, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }
                        ep, err := ic.GetByID(plugins.EventPlugin, "exchange")
                        if err != nil {
                                return nil, err
                        }
                        events := ep.(*exchange.Exchange)
                        cs := metadata.NewContainerStore(m.(*metadata.DB))
                        ss := metadata.NewSandboxStore(m.(*metadata.DB))
                        return NewShimManager(&ManagerConfig{
                                Address:      ic.Properties[plugins.PropertyGRPCAddress],
                                TTRPCAddress: ic.Properties[plugins.PropertyTTRPCAddress],
                                Events:       events,
                                Store:        cs,
                                SchedCore:    config.SchedCore,
                                SandboxStore: ss,
                        })
                },
                ConfigMigration: func(ctx context.Context, configVersion int, pluginConfigs map[string]interface{}) error {
                        // Migrate configurations from io.containerd.runtime.v2.task
                        // if the configVersion >= 3 please make sure the config is under io.containerd.shim.v1.shim.
                        if configVersion >= version.ConfigVersion {
                                return nil
                        }
                        const originalPluginName = string(plugins.RuntimePluginV2) + ".task"
                        original, ok := pluginConfigs[originalPluginName]
                        if !ok {
                                return nil
                        }
                        const newPluginName = string(plugins.ShimPlugin) + ".shim"
                        pluginConfigs[originalPluginName] = nil
                        pluginConfigs[newPluginName] = original
                        return nil
                },
        })
}

// ManagerConfig carries the dependencies and settings that NewShimManager
// copies into a new ShimManager.
type ManagerConfig struct {
        // Store is the container store the manager uses to look up containers.
        Store containers.Store
        // Events is the exchange passed to cleanupAfterDeadShim when a shim dies.
        Events *exchange.Exchange
        // Address becomes the manager's containerd address, which is handed to
        // shim binaries.
        Address string
        // TTRPCAddress becomes the manager's containerd TTRPC address, which is
        // handed to shim binaries.
        TTRPCAddress string
        // SchedCore is forwarded to shim binaries as their schedCore setting.
        SchedCore bool
        // SandboxStore is the sandbox store the manager consults when loading
        // existing shims.
        SandboxStore sandbox.Store
}

// NewShimManager builds a manager for v2 shims from config. The manager starts
// with an empty shim map; the returned error is always nil.
func NewShimManager(config *ManagerConfig) (*ShimManager, error) {
        return &ShimManager{
                containerdAddress:      config.Address,
                containerdTTRPCAddress: config.TTRPCAddress,
                schedCore:              config.SchedCore,
                shims:                  runtime.NewNSMap[ShimInstance](),
                events:                 config.Events,
                containers:             config.Store,
                sandboxStore:           config.SandboxStore,
        }, nil
}

// ShimManager manages currently running shim processes.
// It is mainly responsible for launching new shims and for proper shutdown and cleanup of existing instances.
// The manager is unaware of the underlying services a shim provides; higher level services consume them
// without having to care about lifecycle management.
type ShimManager struct {
        // containerdAddress and containerdTTRPCAddress are passed to every shim
        // binary as its address and ttrpcAddress.
        containerdAddress      string
        containerdTTRPCAddress string
        // schedCore is passed to every shim binary as its schedCore setting.
        schedCore bool
        // shims holds the shim instances known to the manager.
        shims *runtime.NSMap[ShimInstance]
        // events is handed to cleanupAfterDeadShim when a shim disconnects.
        events *exchange.Exchange
        // containers is used to look up container records by ID.
        containers containers.Store
        // runtimePaths is a cache of `runtime names` -> `resolved fs path`
        runtimePaths sync.Map
        // sandboxStore is queried when loading an existing shim to tell sandbox
        // shims apart from leaked ones.
        sandboxStore sandbox.Store
}

// ID returns the plugin identifier of the shim manager: the shim plugin type
// followed by ".shim".
func (m *ShimManager) ID() string {
        pluginType := plugins.ShimPlugin.String()
        return pluginType + ".shim"
}

// Start launches a new shim instance for bundle and registers it with the
// manager.
//
// When opts.SandboxID is set the container belongs to a sandbox that is
// expected to be running already. No new shim process is started in that
// case: the sandbox's bootstrap parameters are written into the bundle and the
// shim is loaded from them. The parameters come from opts.Address when it is
// set, and otherwise from the bundle of the registered sandbox shim.
//
// Otherwise a new shim is started via startShim; if registering it fails the
// shim is cleaned up again before the error is returned.
func (m *ShimManager) Start(ctx context.Context, id string, bundle *Bundle, opts runtime.CreateOpts) (_ ShimInstance, retErr error) {
        // This container belongs to sandbox which supposed to be already started via sandbox API.
        if opts.SandboxID != "" {
                var params shimbinary.BootstrapParams
                if opts.Address != "" {
                        // The address returned from sandbox controller should be in the form like ttrpc+unix://<uds-path>
                        // or grpc+vsock://<cid>:<port>, we should get the protocol from the url first.
                        protocol, address, ok := strings.Cut(opts.Address, "+")
                        if !ok {
                                return nil, fmt.Errorf("the scheme of sandbox address should be in " +
                                        "the form of <protocol>+<unix|vsock|tcp>, i.e. ttrpc+unix or grpc+vsock")
                        }
                        params = shimbinary.BootstrapParams{
                                Version:  int(opts.Version),
                                Protocol: protocol,
                                Address:  address,
                        }
                } else {
                        // For those sandbox we can not get endpoint,
                        // fallback to legacy implementation
                        process, err := m.Get(ctx, opts.SandboxID)
                        if err != nil {
                                // Keep the lookup error in the chain so callers can inspect it.
                                return nil, fmt.Errorf("can't find sandbox %s: %w", opts.SandboxID, err)
                        }
                        p, restoreErr := restoreBootstrapParams(process.Bundle())
                        if restoreErr != nil {
                                // Only restoreErr is meaningful here; the lookup above succeeded.
                                return nil, fmt.Errorf("failed to get bootstrap "+
                                        "params of sandbox %s, legacy restore error: %w", opts.SandboxID, restoreErr)
                        }
                        params = p
                }

                // Write sandbox ID this task belongs to.
                if err := os.WriteFile(filepath.Join(bundle.Path, "sandbox"), []byte(opts.SandboxID), 0600); err != nil {
                        return nil, err
                }

                if err := writeBootstrapParams(filepath.Join(bundle.Path, "bootstrap.json"), params); err != nil {
                        return nil, fmt.Errorf("failed to write bootstrap.json for bundle %s: %w", bundle.Path, err)
                }

                shim, err := loadShim(ctx, bundle, func() {})
                if err != nil {
                        return nil, fmt.Errorf("failed to load sandbox task %q: %w", opts.SandboxID, err)
                }

                if err := m.shims.Add(ctx, shim); err != nil {
                        return nil, err
                }

                return shim, nil
        }

        shim, err := m.startShim(ctx, bundle, id, opts)
        if err != nil {
                return nil, err
        }
        // From here on a failure must not leave the freshly started shim behind.
        defer func() {
                if retErr != nil {
                        m.cleanupShim(ctx, shim)
                }
        }()

        if err := m.shims.Add(ctx, shim); err != nil {
                return nil, fmt.Errorf("failed to add task: %w", err)
        }

        return shim, nil
}

// startShim resolves the runtime binary named in opts and starts a new shim
// process for bundle. The options given to the shim are opts.TaskOptions, or
// opts.RuntimeOptions when the task options are nil or carry no value.
func (m *ShimManager) startShim(ctx context.Context, bundle *Bundle, id string, opts runtime.CreateOpts) (*shim, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }
        ctx = log.WithLogger(ctx, log.G(ctx).WithField("namespace", ns))

        shimOpts := opts.TaskOptions
        if shimOpts == nil || shimOpts.GetValue() == nil {
                shimOpts = opts.RuntimeOptions
        }

        runtimePath, err := m.resolveRuntimePath(opts.Runtime)
        if err != nil {
                return nil, fmt.Errorf("failed to resolve runtime path: %w", err)
        }

        b := shimBinary(bundle, shimBinaryConfig{
                runtime:      runtimePath,
                address:      m.containerdAddress,
                ttrpcAddress: m.containerdTTRPCAddress,
                schedCore:    m.schedCore,
        })

        // onDisconnect runs when the connection to the shim is closed.
        onDisconnect := func() {
                log.G(ctx).WithField("id", id).Info("shim disconnected")

                cleanupAfterDeadShim(cleanup.Background(ctx), id, m.shims, m.events, b)
                // Remove self from the runtime task list. Even though the cleanupAfterDeadShim()
                // would publish taskExit event, but the shim.Delete() would always failed with ttrpc
                // disconnect and there is no chance to remove this dead task from runtime task lists.
                // Thus it's better to delete it here.
                m.shims.Delete(ctx, id)
        }

        started, err := b.Start(ctx, protobuf.FromAny(shimOpts), onDisconnect)
        if err != nil {
                return nil, fmt.Errorf("start failed: %w", err)
        }

        return started, nil
}

// restoreBootstrapParams returns the shim bootstrap configuration stored in
// bundlePath/bootstrap.json.
// If that file does not exist the bundle is treated as belonging to an older
// shim and is migrated: the socket address is read from the "address" file and
// a default configuration (version = 2, protocol = ttrpc, and that address) is
// written to bootstrap.json and returned.
func restoreBootstrapParams(bundlePath string) (shimbinary.BootstrapParams, error) {
        var none shimbinary.BootstrapParams
        bootstrapFile := filepath.Join(bundlePath, "bootstrap.json")

        // Read bootstrap.json if exists
        _, statErr := os.Stat(bootstrapFile)
        if statErr == nil {
                return readBootstrapParams(bootstrapFile)
        }
        if !errors.Is(statErr, os.ErrNotExist) {
                return none, fmt.Errorf("failed to stat %s: %w", bootstrapFile, statErr)
        }

        // File not found, likely it's an older shim. Try to migrate.
        address, err := shimbinary.ReadAddress(filepath.Join(bundlePath, "address"))
        if err != nil {
                return none, fmt.Errorf("unable to migrate shim: failed to get socket address for bundle %s: %w", bundlePath, err)
        }

        migrated := shimbinary.BootstrapParams{
                Version:  2,
                Address:  address,
                Protocol: "ttrpc",
        }
        if err := writeBootstrapParams(bootstrapFile, migrated); err != nil {
                return none, fmt.Errorf("unable to migrate: failed to write bootstrap.json file: %w", err)
        }

        return migrated, nil
}

// resolveRuntimePath turns a runtime identifier into the absolute path of the
// shim binary to execute.
//
// runtime is either an absolute path to a binary, which must exist, or a
// runtime name such as `io.containerd.runc.v2`. Relative paths are rejected.
// For a runtime name the binary is looked for, in order, at
// shimbinary.BinaryPath(runtime), in PATH, and next to the running executable.
// Successful name lookups are cached in m.runtimePaths, keyed by binary name.
//
// An error is returned when the name is empty or malformed, when the binary
// cannot be found (wrapping os.ErrNotExist), or when the PATH lookup fails for
// any other reason without producing a path.
func (m *ShimManager) resolveRuntimePath(runtime string) (string, error) {
        if runtime == "" {
                return "", fmt.Errorf("no runtime name")
        }

        // Custom path to runtime binary
        if filepath.IsAbs(runtime) {
                // Make sure it exists before returning ok
                if _, err := os.Stat(runtime); err != nil {
                        return "", fmt.Errorf("invalid custom binary path: %w", err)
                }

                return runtime, nil
        }

        // Check if relative path to runtime binary provided
        if strings.Contains(runtime, "/") {
                return "", fmt.Errorf("invalid runtime name %s, correct runtime name should be either format like `io.containerd.runc.v2` or a full path to the binary", runtime)
        }

        // Preserve existing logic and resolve runtime path from runtime name.

        name := shimbinary.BinaryName(runtime)
        if name == "" {
                return "", fmt.Errorf("invalid runtime name %s, correct runtime name should be either format like `io.containerd.runc.v2` or a full path to the binary", runtime)
        }

        if path, ok := m.runtimePaths.Load(name); ok {
                return path.(string), nil
        }

        var cmdPath string

        binaryPath := shimbinary.BinaryPath(runtime)
        if _, serr := os.Stat(binaryPath); serr == nil {
                cmdPath = binaryPath
        }

        if cmdPath == "" {
                var lerr error
                cmdPath, lerr = exec.LookPath(name)
                switch {
                case lerr == nil:
                        // Found in PATH.
                case errors.Is(lerr, exec.ErrNotFound):
                        self, err := os.Executable()
                        if err != nil {
                                return "", err
                        }

                        // Match the calling binaries (containerd) path and see
                        // if they are side by side. If so, execute the shim
                        // found there.
                        testPath := filepath.Join(filepath.Dir(self), name)
                        if _, serr := os.Stat(testPath); serr != nil {
                                return "", fmt.Errorf("runtime %q binary not installed %q: %w", runtime, name, os.ErrNotExist)
                        }
                        cmdPath = testPath
                case cmdPath == "":
                        // Any other lookup failure that yields no path must not fall
                        // through: filepath.Abs("") resolves to the working directory,
                        // which would then be cached and returned as the shim binary.
                        return "", fmt.Errorf("failed to look up runtime %q binary %q: %w", runtime, name, lerr)
                }
                // A lookup that reports an error but still returns a path is used
                // as-is, as before.
        }

        cmdPath, err := filepath.Abs(cmdPath)
        if err != nil {
                return "", err
        }

        if path, ok := m.runtimePaths.LoadOrStore(name, cmdPath); ok {
                // We didn't store cmdPath we loaded an already cached value. Use it.
                cmdPath = path.(string)
        }

        return cmdPath, nil
}

// cleanupShim deletes shim and drops it from the manager's shim map after a
// failed start. It runs on a background context limited by cleanupTimeout.
// The shim's Delete error is intentionally ignored.
func (m *ShimManager) cleanupShim(ctx context.Context, shim *shim) {
        background := cleanup.Background(ctx)
        dctx, cancel := timeout.WithContext(background, cleanupTimeout)
        defer cancel()

        _ = shim.Delete(dctx)
        m.shims.Delete(dctx, shim.ID())
}

// Get returns the shim instance registered under id, using the lookup result
// of the manager's shim map unchanged.
func (m *ShimManager) Get(ctx context.Context, id string) (ShimInstance, error) {
        instance, err := m.shims.Get(ctx, id)
        return instance, err
}

// Delete deletes the shim registered under id and removes it from the
// manager's shim map. The map entry is removed even when deleting the shim
// fails; that failure is what gets returned.
func (m *ShimManager) Delete(ctx context.Context, id string) error {
        instance, err := m.shims.Get(ctx, id)
        if err != nil {
                return err
        }

        deleteErr := instance.Delete(ctx)
        m.shims.Delete(ctx, id)

        return deleteErr
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "context"
        "errors"
        "io"
        "net"
        "os"
        "path/filepath"
        "time"

        "github.com/containerd/fifo"
        "golang.org/x/sys/unix"
)

// openShimLog opens the "log" fifo inside the bundle directory, creating it
// if needed, in non-blocking read/write mode. The third parameter is unused
// in this (non-Windows) implementation.
func openShimLog(ctx context.Context, bundle *Bundle, _ func(string, time.Duration) (net.Conn, error)) (io.ReadCloser, error) {
        const (
                flags = unix.O_RDWR | unix.O_CREAT | unix.O_NONBLOCK
                perm  = 0700
        )
        logPath := filepath.Join(bundle.Path, "log")
        return fifo.OpenFifo(ctx, logPath, flags, perm)
}

// checkCopyShimLogError filters the error of a shim log copy. When ctx is
// already done, errors caused by the log being closed (fifo.ErrReadClosed or
// os.ErrClosed) are turned into nil; every other case returns err unchanged.
func checkCopyShimLogError(ctx context.Context, err error) error {
        select {
        case <-ctx.Done():
        default:
                // Context still active: report the error as is.
                return err
        }

        if err == fifo.ErrReadClosed || errors.Is(err, os.ErrClosed) {
                return nil
        }
        return err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package v2

import (
        "bytes"
        "context"
        "errors"
        "fmt"
        "os"
        "os/exec"
        "slices"

        "github.com/containerd/errdefs"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "github.com/containerd/typeurl/v2"
        "github.com/opencontainers/runtime-spec/specs-go"
        "github.com/opencontainers/runtime-spec/specs-go/features"

        apitypes "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/runtime"
        "github.com/containerd/containerd/v2/internal/cleanup"
        "github.com/containerd/containerd/v2/pkg/protobuf/proto"
        "github.com/containerd/containerd/v2/pkg/timeout"
        "github.com/containerd/containerd/v2/plugins"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.RuntimePluginV2,
                ID:   "task",
                Requires: []plugin.Type{
                        plugins.ShimPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        shimManagerI, err := ic.GetByID(plugins.ShimPlugin, "shim")
                        if err != nil {
                                return nil, err
                        }
                        shimManager := shimManagerI.(*ShimManager)
                        root, state := ic.Properties[plugins.PropertyRootDir], ic.Properties[plugins.PropertyStateDir]
                        for _, d := range []string{root, state} {
                                if err := os.MkdirAll(d, 0711); err != nil {
                                        return nil, err
                                }
                        }
                        return NewTaskManager(ic.Context, root, state, shimManager)
                },
        })
}

// TaskManager wraps task service client on top of shim manager.
type TaskManager struct {
        // root is the directory for persistent data; it is passed to NewBundle
        // when a task is created.
        root string
        // state is the directory for transient data; it is passed to NewBundle
        // when a task is created.
        state string
        // manager is the shim manager used to start, look up and delete shims.
        manager *ShimManager
}

// NewTaskManager creates a new task manager instance.
// root is the rootDir of TaskManager plugin to store persistent data
// state is the stateDir of TaskManager plugin to store transient data
// shims is the ShimManager for TaskManager to create/delete shims
//
// Existing shims found under state are loaded into shims before the manager
// is returned. If that fails, the returned error wraps the underlying cause.
func NewTaskManager(ctx context.Context, root, state string, shims *ShimManager) (*TaskManager, error) {
        if err := shims.LoadExistingShims(ctx, state, root); err != nil {
                // Wrap the cause instead of discarding it so the failure can be diagnosed.
                return nil, fmt.Errorf("failed to load existing shims for task manager: %w", err)
        }
        m := &TaskManager{
                root:    root,
                state:   state,
                manager: shims,
        }
        return m, nil
}

// ID returns the plugin identifier of the task manager: the runtime v2 plugin
// type followed by ".task".
func (m *TaskManager) ID() string {
        pluginType := plugins.RuntimePluginV2.String()
        return pluginType + ".task"
}

// Create launches new shim instance and creates new task
//
// It creates a bundle for taskID, starts (or attaches to) the shim through the
// shim manager, validates the runtime features required by opts and finally
// asks the shim to create the task. The bundle is deleted again whenever an
// error is returned.
//
// NOTE(review): when newShimTask or validateRuntimeFeatures fails, this
// function returns without deleting the shim or removing it from
// m.manager.shims; only the deferred bundle.Delete runs. Confirm whether the
// started shim should be cleaned up on those paths as well.
func (m *TaskManager) Create(ctx context.Context, taskID string, opts runtime.CreateOpts) (_ runtime.Task, retErr error) {
        bundle, err := NewBundle(ctx, m.root, m.state, taskID, opts.Spec)
        if err != nil {
                return nil, err
        }
        // Remove the bundle again if anything below fails.
        defer func() {
                if retErr != nil {
                        bundle.Delete()
                }
        }()

        shim, err := m.manager.Start(ctx, taskID, bundle, opts)
        if err != nil {
                return nil, fmt.Errorf("failed to start shim: %w", err)
        }

        // Cast to shim task and call task service to create a new container task instance.
        // This will not be required once shim service / client implemented.
        shimTask, err := newShimTask(shim)
        if err != nil {
                return nil, err
        }

        // runc ignores silently features it doesn't know about, so for things that this is
        // problematic let's check if this runc version supports them.
        if err := m.validateRuntimeFeatures(ctx, opts); err != nil {
                return nil, fmt.Errorf("failed to validate OCI runtime features: %w", err)
        }

        t, err := shimTask.Create(ctx, opts)
        if err != nil {
                // NOTE: ctx contains required namespace information.
                m.manager.shims.Delete(ctx, taskID)

                // Clean up on a background-derived context bounded by cleanupTimeout
                // rather than on the caller's ctx.
                dctx, cancel := timeout.WithContext(cleanup.Background(ctx), cleanupTimeout)
                defer cancel()

                sandboxed := opts.SandboxID != ""
                _, errShim := shimTask.delete(dctx, sandboxed, func(context.Context, string) {})
                if errShim != nil {
                        // If the delete ran out of time, give the shutdown below a fresh
                        // timeout instead of the already expired context.
                        if errdefs.IsDeadlineExceeded(errShim) {
                                dctx, cancel = timeout.WithContext(cleanup.Background(ctx), cleanupTimeout)
                                defer cancel()
                        }

                        shimTask.Shutdown(dctx)
                        shimTask.Close()
                }

                // The original Create error is reported; cleanup errors are not.
                return nil, fmt.Errorf("failed to create shim task: %w", err)
        }

        return t, nil
}

// Get returns the task backed by the shim registered under id. A failed shim
// lookup is returned unchanged.
func (m *TaskManager) Get(ctx context.Context, id string) (runtime.Task, error) {
        instance, lookupErr := m.manager.shims.Get(ctx, id)
        if lookupErr != nil {
                return nil, lookupErr
        }
        return newShimTask(instance)
}

// Tasks returns one task per shim reported by the shim map for ctx; the all
// flag is passed through to the map's GetAll. The first shim that cannot be
// wrapped in a task aborts the listing with its error.
func (m *TaskManager) Tasks(ctx context.Context, all bool) ([]runtime.Task, error) {
        instances, err := m.manager.shims.GetAll(ctx, all)
        if err != nil {
                return nil, err
        }

        tasks := make([]runtime.Task, 0, len(instances))
        for _, instance := range instances {
                task, wrapErr := newShimTask(instance)
                if wrapErr != nil {
                        return nil, wrapErr
                }
                tasks = append(tasks, task)
        }
        return tasks, nil
}

// Delete deletes the task identified by taskID together with its shim
// instance and returns the task's exit information. Whether the task is
// treated as sandboxed is derived from the container record's SandboxID.
func (m *TaskManager) Delete(ctx context.Context, taskID string) (*runtime.Exit, error) {
        instance, err := m.manager.shims.Get(ctx, taskID)
        if err != nil {
                return nil, err
        }

        container, err := m.manager.containers.Get(ctx, taskID)
        if err != nil {
                return nil, err
        }

        task, err := newShimTask(instance)
        if err != nil {
                return nil, err
        }

        // unregister is passed to the task delete as its removal callback and
        // drops the shim from the manager's map.
        unregister := func(ctx context.Context, id string) {
                m.manager.shims.Delete(ctx, id)
        }

        exit, err := task.delete(ctx, container.SandboxID != "", unregister)
        if err != nil {
                return nil, fmt.Errorf("failed to delete task: %w", err)
        }

        return exit, nil
}

// PluginInfo runs the resolved runtime binary with the "-info" argument and
// returns the *apitypes.RuntimeInfo decoded from its standard output.
//
// request must be a *apitypes.RuntimeRequest; any other type yields an error
// wrapping errdefs.ErrNotImplemented. When the request carries options they are
// marshaled and supplied to the binary on stdin. The binary's stderr is
// captured and included in the error when the command fails.
func (m *TaskManager) PluginInfo(ctx context.Context, request interface{}) (interface{}, error) {
        req, ok := request.(*apitypes.RuntimeRequest)
        if !ok {
                return nil, fmt.Errorf("unknown request type %T: %w", request, errdefs.ErrNotImplemented)
        }

        binary, err := m.manager.resolveRuntimePath(req.RuntimePath)
        if err != nil {
                return nil, fmt.Errorf("failed to resolve runtime path: %w", err)
        }

        // stdin stays empty when the request has no options.
        var stdin []byte
        if opts := req.Options; opts != nil {
                if stdin, err = proto.Marshal(opts); err != nil {
                        return nil, fmt.Errorf("failed to marshal %s: %w", opts.TypeUrl, err)
                }
        }

        var errOut bytes.Buffer
        cmd := exec.CommandContext(ctx, binary, "-info")
        cmd.Stdin = bytes.NewReader(stdin)
        cmd.Stderr = &errOut
        out, err := cmd.Output()
        if err != nil {
                return nil, fmt.Errorf("failed to run %v: %w (stderr: %q)", cmd.Args, err, errOut.String())
        }

        info := new(apitypes.RuntimeInfo)
        if err = proto.Unmarshal(out, info); err != nil {
                return nil, fmt.Errorf("failed to unmarshal stdout from %v into %T: %w", cmd.Args, info, err)
        }
        return info, nil
}

// validateRuntimeFeatures checks that the runtime named in opts supports the
// features required by the OCI spec carried in opts.Spec.
//
// Only idmap mounts are checked. When the spec uses none, the runtime is not
// queried and nil is returned.
func (m *TaskManager) validateRuntimeFeatures(ctx context.Context, opts runtime.CreateOpts) error {
        var ociSpec specs.Spec
        if err := typeurl.UnmarshalTo(opts.Spec, &ociSpec); err != nil {
                return fmt.Errorf("unmarshal spec: %w", err)
        }

        // Querying the runtime is only needed when idmap mounts are requested.
        if !usesIDMapMounts(ociSpec) {
                return nil
        }

        rawInfo, err := m.PluginInfo(ctx, &apitypes.RuntimeRequest{RuntimePath: opts.Runtime})
        if err != nil {
                return fmt.Errorf("runtime info: %w", err)
        }

        info, ok := rawInfo.(*apitypes.RuntimeInfo)
        if !ok {
                return fmt.Errorf("invalid runtime info type: %T", rawInfo)
        }

        decoded, err := typeurl.UnmarshalAny(info.Features)
        if err != nil {
                return fmt.Errorf("unmarshal runtime features: %w", err)
        }

        // runc-compatible runtimes silently ignore features they don't know about.
        // Ignoring a request for idmap mounts can break permissions in the volume,
        // so make sure the runtime really supports them. For more info, see:
        //        https://github.com/opencontainers/runtime-spec/pull/1219
        runtimeFeatures, ok := decoded.(*features.Features)
        if !ok {
                // Runtimes that are not runc-compatible and don't provide the
                // features info are left alone; they might not be affected.
                return nil
        }

        if err := supportsIDMapMounts(runtimeFeatures); err != nil {
                return fmt.Errorf("idmap mounts not supported: %w", err)
        }

        return nil
}

// usesIDMapMounts reports whether any mount in spec requests ID mapping, either
// through explicit UID/GID mappings or through the "idmap"/"ridmap" options.
func usesIDMapMounts(spec specs.Spec) bool {
        for i := range spec.Mounts {
                mnt := &spec.Mounts[i]
                switch {
                case mnt.UIDMappings != nil, mnt.GIDMappings != nil:
                        return true
                case slices.Contains(mnt.Options, "idmap"), slices.Contains(mnt.Options, "ridmap"):
                        return true
                }
        }
        return false
}

// supportsIDMapMounts returns nil when features advertises enabled idmap mount
// support, and a descriptive error otherwise.
//
// Every level of the features tree is optional, so each pointer on the path to
// `linux.mountExtensions.idmap.enabled` is checked before it is dereferenced. A
// runtime that reports no linux section at all is treated the same as one that
// lacks the idmap entry.
func supportsIDMapMounts(features *features.Features) error {
        if features == nil || features.Linux == nil {
                return errors.New("missing `mountExtensions.idmap` entry in `features` command")
        }
        ext := features.Linux.MountExtensions
        if ext == nil || ext.IDMap == nil {
                return errors.New("missing `mountExtensions.idmap` entry in `features` command")
        }
        if enabled := ext.IDMap.Enabled; enabled == nil || !*enabled {
                return errors.New("entry `mountExtensions.idmap.Enabled` in `features` command not present or disabled")
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sandbox

import (
        "context"
        "fmt"

        "github.com/containerd/ttrpc"
        "google.golang.org/grpc"

        api "github.com/containerd/containerd/api/runtime/sandbox/v1"
)

// NewClient builds a sandbox service client from either a TTRPC client or a
// GRPC connection. GRPC connections are wrapped so that they satisfy the TTRPC
// service interface. Any other client type results in an error.
func NewClient(client interface{}) (api.TTRPCSandboxService, error) {
        if c, ok := client.(*ttrpc.Client); ok {
                return api.NewTTRPCSandboxClient(c), nil
        }
        if c, ok := client.(grpc.ClientConnInterface); ok {
                return &grpcBridge{client: api.NewSandboxClient(c)}, nil
        }
        return nil, fmt.Errorf("unsupported client type %T", client)
}

// grpcBridge adapts a GRPC sandbox client to the TTRPC sandbox service
// interface; each of its methods forwards to the wrapped client.
type grpcBridge struct {
        // client is the GRPC sandbox client that receives every call.
        client api.SandboxClient
}

// Compile-time check that grpcBridge implements api.TTRPCSandboxService.
var _ api.TTRPCSandboxService = (*grpcBridge)(nil)

// CreateSandbox forwards req to the wrapped GRPC client and returns its result unchanged.
func (g *grpcBridge) CreateSandbox(ctx context.Context, req *api.CreateSandboxRequest) (*api.CreateSandboxResponse, error) {
        return g.client.CreateSandbox(ctx, req)
}

// StartSandbox forwards req to the wrapped GRPC client and returns its result unchanged.
func (g *grpcBridge) StartSandbox(ctx context.Context, req *api.StartSandboxRequest) (*api.StartSandboxResponse, error) {
        return g.client.StartSandbox(ctx, req)
}

// Platform forwards req to the wrapped GRPC client and returns its result unchanged.
func (g *grpcBridge) Platform(ctx context.Context, req *api.PlatformRequest) (*api.PlatformResponse, error) {
        return g.client.Platform(ctx, req)
}

// StopSandbox forwards req to the wrapped GRPC client and returns its result unchanged.
func (g *grpcBridge) StopSandbox(ctx context.Context, req *api.StopSandboxRequest) (*api.StopSandboxResponse, error) {
        return g.client.StopSandbox(ctx, req)
}

// WaitSandbox forwards req to the wrapped GRPC client and returns its result unchanged.
func (g *grpcBridge) WaitSandbox(ctx context.Context, req *api.WaitSandboxRequest) (*api.WaitSandboxResponse, error) {
        return g.client.WaitSandbox(ctx, req)
}

// SandboxStatus forwards req to the wrapped GRPC client and returns its result unchanged.
func (g *grpcBridge) SandboxStatus(ctx context.Context, req *api.SandboxStatusRequest) (*api.SandboxStatusResponse, error) {
        return g.client.SandboxStatus(ctx, req)
}

// PingSandbox forwards req to the wrapped GRPC client and returns its result unchanged.
func (g *grpcBridge) PingSandbox(ctx context.Context, req *api.PingRequest) (*api.PingResponse, error) {
        return g.client.PingSandbox(ctx, req)
}

// ShutdownSandbox forwards req to the wrapped GRPC client and returns its result unchanged.
func (g *grpcBridge) ShutdownSandbox(ctx context.Context, req *api.ShutdownSandboxRequest) (*api.ShutdownSandboxResponse, error) {
        return g.client.ShutdownSandbox(ctx, req)
}

// SandboxMetrics forwards req to the wrapped GRPC client and returns its result unchanged.
func (g *grpcBridge) SandboxMetrics(ctx context.Context, req *api.SandboxMetricsRequest) (*api.SandboxMetricsResponse, error) {
        return g.client.SandboxMetrics(ctx, req)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sandbox

import (
        "context"
        "fmt"
        "time"

        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/platforms"
        "github.com/containerd/typeurl/v2"
)

// CreateOptions collects the optional settings that CreateOpt functions
// populate for a sandbox Create call.
type CreateOptions struct {
        // Rootfs holds the rootfs mounts; set by WithRootFS.
        Rootfs []mount.Mount
        // Options are used to pass arbitrary options to the shim when creating a new sandbox.
        // CRI will use this to pass PodSandboxConfig.
        // Don't confuse this with Runtime options, which are passed at shim instance start
        // to setup global shim configuration.
        // Set by WithOptions; it stays nil when that option is not used.
        Options     typeurl.Any
        // NetNSPath is the network namespace path; set by WithNetNSPath.
        NetNSPath   string
        // Annotations holds the sandbox annotations; set by WithAnnotations.
        Annotations map[string]string
}

// CreateOpt mutates a CreateOptions value. It returns an error when the option
// cannot be applied (for example WithOptions, when marshaling fails).
type CreateOpt func(*CreateOptions) error

// WithRootFS returns a CreateOpt that records m as the rootfs mounts to create
// the sandbox with. The slice is stored as given, not copied.
func WithRootFS(m []mount.Mount) CreateOpt {
        return func(opts *CreateOptions) error {
                opts.Rootfs = m
                return nil
        }
}

// WithOptions returns a CreateOpt that marshals options into a typeurl.Any and
// stores it as the sandbox creation options. Marshaling happens when the
// returned option is applied; a marshaling failure is reported as a wrapped error.
func WithOptions(options any) CreateOpt {
        return func(opts *CreateOptions) error {
                marshaled, err := typeurl.MarshalAny(options)
                // The field is assigned before the error check, so it is
                // overwritten even when marshaling fails.
                opts.Options = marshaled
                if err != nil {
                        return fmt.Errorf("failed to marshal sandbox options: %w", err)
                }
                return nil
        }
}

// WithNetNSPath returns a CreateOpt that assigns netNSPath as the network
// namespace path of the sandbox.
func WithNetNSPath(netNSPath string) CreateOpt {
        return func(opts *CreateOptions) error {
                opts.NetNSPath = netNSPath
                return nil
        }
}

// WithAnnotations returns a CreateOpt that uses annotations for sandbox
// creation. The map is stored as given, not copied.
func WithAnnotations(annotations map[string]string) CreateOpt {
        return func(opts *CreateOptions) error {
                opts.Annotations = annotations
                return nil
        }
}

// StopOptions collects the optional settings that StopOpt functions populate
// for a sandbox Stop call.
type StopOptions struct {
        // Timeout is the stop timeout; it is nil unless WithTimeout was applied.
        Timeout *time.Duration
}

// StopOpt mutates a StopOptions value. Unlike CreateOpt it cannot fail.
type StopOpt func(*StopOptions)

// WithTimeout returns a StopOpt that sets the stop timeout to timeout.
func WithTimeout(timeout time.Duration) StopOpt {
        return func(opts *StopOptions) {
                // Every application of this option stores a pointer to the same captured value.
                opts.Timeout = &timeout
        }
}

// Controller is an interface to manage sandboxes at runtime.
// When running in sandbox mode, the shim is expected to implement `SandboxService`.
// Shim lifetimes are managed explicitly by containerd's client through the sandbox API.
type Controller interface {
        // Create initializes the sandbox environment for sandboxInfo. opts carry
        // optional settings such as rootfs mounts, shim options, the network
        // namespace path and annotations.
        Create(ctx context.Context, sandboxInfo Sandbox, opts ...CreateOpt) error
        // Start will start a previously created sandbox.
        Start(ctx context.Context, sandboxID string) (ControllerInstance, error)
        // Platform returns the target sandbox OS that will be used by the Controller.
        // containerd will rely on this to generate a proper OCI spec.
        Platform(_ctx context.Context, _sandboxID string) (platforms.Platform, error)
        // Stop will stop the sandbox instance. opts may carry a timeout (see WithTimeout).
        Stop(ctx context.Context, sandboxID string, opts ...StopOpt) error
        // Wait blocks until the sandbox process exits.
        Wait(ctx context.Context, sandboxID string) (ExitStatus, error)
        // Status will query sandbox process status. It is heavier than a Ping call and must be used whenever you need to
        // gather metadata about the current sandbox state (status, uptime, resource use, etc).
        Status(ctx context.Context, sandboxID string, verbose bool) (ControllerStatus, error)
        // Shutdown deletes and cleans all tasks and the sandbox instance.
        Shutdown(ctx context.Context, sandboxID string) error
        // Metrics queries the sandbox for metrics.
        Metrics(ctx context.Context, sandboxID string) (*types.Metric, error)
}

// ControllerInstance is the result of Controller.Start and describes the
// started sandbox.
type ControllerInstance struct {
        // SandboxID identifies the sandbox that was started.
        SandboxID string
        // Pid is the process ID reported for the sandbox.
        Pid       uint32
        // CreatedAt is the creation time reported for the sandbox.
        CreatedAt time.Time
        // Address is the address reported for the sandbox
        // (presumably the endpoint used to reach it; confirm against the controller implementation).
        Address   string
        // Version is the version number reported alongside Address
        // (NOTE(review): its exact meaning is not visible here; confirm).
        Version   uint32
        // Labels holds the labels reported for the sandbox.
        Labels    map[string]string
}

// ExitStatus is the result of Controller.Wait.
type ExitStatus struct {
        // ExitStatus is the exit status reported for the sandbox.
        ExitStatus uint32
        // ExitedAt is the time at which the sandbox is reported to have exited.
        ExitedAt   time.Time
}

// ControllerStatus is the result of Controller.Status and describes the
// current state of a sandbox.
type ControllerStatus struct {
        // SandboxID identifies the sandbox the status belongs to.
        SandboxID string
        // Pid is the process ID reported for the sandbox.
        Pid       uint32
        // State is the sandbox state as a string; the set of valid values is
        // defined by the controller implementation.
        State     string
        // Info carries additional key/value information
        // (presumably only filled for verbose queries; confirm).
        Info      map[string]string
        // CreatedAt is the creation time reported for the sandbox.
        CreatedAt time.Time
        // ExitedAt is the exit time reported for the sandbox.
        ExitedAt  time.Time
        // Extra carries an implementation-specific payload.
        Extra     typeurl.Any
        // Address is the address reported for the sandbox.
        Address   string
        // Version is the version number reported alongside Address.
        Version   uint32
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sandbox

import (
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        gogo_types "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/typeurl/v2"
)

// ToProto converts a Sandbox into its protobuf representation.
//
// The Extensions map of the result is always non-nil, even when the sandbox
// has no extensions. sandbox must not be nil.
func ToProto(sandbox *Sandbox) *types.Sandbox {
        pbExtensions := make(map[string]*gogo_types.Any)
        for name, ext := range sandbox.Extensions {
                pbExtensions[name] = protobuf.FromAny(ext)
        }

        pbRuntime := &types.Sandbox_Runtime{
                Name:    sandbox.Runtime.Name,
                Options: protobuf.FromAny(sandbox.Runtime.Options),
        }

        pb := &types.Sandbox{
                SandboxID:  sandbox.ID,
                Runtime:    pbRuntime,
                Sandboxer:  sandbox.Sandboxer,
                Labels:     sandbox.Labels,
                CreatedAt:  protobuf.ToTimestamp(sandbox.CreatedAt),
                UpdatedAt:  protobuf.ToTimestamp(sandbox.UpdatedAt),
                Extensions: pbExtensions,
                Spec:       protobuf.FromAny(sandbox.Spec),
        }
        return pb
}

// FromProto maps a protobuf sandbox definition to a Sandbox struct.
//
// The message is read through its generated getters, which are safe to call on
// a nil receiver. A nil message, or one without a Runtime section, therefore
// yields zero values instead of a nil pointer dereference. The Extensions map
// of the result is always non-nil.
func FromProto(sandboxpb *types.Sandbox) Sandbox {
        // Runtime is an optional sub-message; only read it when present.
        var runtime RuntimeOpts
        if rt := sandboxpb.GetRuntime(); rt != nil {
                runtime.Name = rt.Name
                runtime.Options = rt.Options
        }

        pbExtensions := sandboxpb.GetExtensions()
        extensions := make(map[string]typeurl.Any, len(pbExtensions))
        for k, v := range pbExtensions {
                extensions[k] = v
        }

        return Sandbox{
                ID:         sandboxpb.GetSandboxID(),
                Labels:     sandboxpb.GetLabels(),
                Runtime:    runtime,
                Spec:       sandboxpb.GetSpec(),
                Sandboxer:  sandboxpb.GetSandboxer(),
                CreatedAt:  protobuf.FromTimestamp(sandboxpb.GetCreatedAt()),
                UpdatedAt:  protobuf.FromTimestamp(sandboxpb.GetUpdatedAt()),
                Extensions: extensions,
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proxy

import (
        "context"

        api "github.com/containerd/containerd/api/services/sandbox/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
        "google.golang.org/protobuf/types/known/anypb"
)

// remoteSandboxController is a low level GRPC client for containerd's sandbox controller service
type remoteSandboxController struct {
        // client is the generated GRPC client that every method delegates to.
        client api.ControllerClient
}

// Compile-time check that remoteSandboxController implements sandbox.Controller.
var _ sandbox.Controller = (*remoteSandboxController)(nil)

// NewSandboxController wraps a GRPC controller client in a sandbox.Controller.
func NewSandboxController(client api.ControllerClient) sandbox.Controller {
        controller := new(remoteSandboxController)
        controller.client = client
        return controller
}

// Create asks the remote controller to create the sandbox described by
// sandboxInfo, applying opts to build the request.
//
// An error returned by any option aborts the call before the request is sent.
// The request's Options field is only populated when an option set
// CreateOptions.Options; that field is a nil interface otherwise, and calling
// a method on it would panic. Errors from the remote call are translated with
// errdefs.FromGRPC.
func (s *remoteSandboxController) Create(ctx context.Context, sandboxInfo sandbox.Sandbox, opts ...sandbox.CreateOpt) error {
        var options sandbox.CreateOptions
        for _, opt := range opts {
                if err := opt(&options); err != nil {
                        return err
                }
        }

        req := &api.ControllerCreateRequest{
                SandboxID:   sandboxInfo.ID,
                Rootfs:      mount.ToProto(options.Rootfs),
                NetnsPath:   options.NetNSPath,
                Annotations: options.Annotations,
        }
        if options.Options != nil {
                req.Options = &anypb.Any{
                        TypeUrl: options.Options.GetTypeUrl(),
                        Value:   options.Options.GetValue(),
                }
        }

        if _, err := s.client.Create(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }

        return nil
}

// Start asks the remote controller to start the sandbox and returns the
// instance details from the response. Errors are translated with
// errdefs.FromGRPC.
func (s *remoteSandboxController) Start(ctx context.Context, sandboxID string) (sandbox.ControllerInstance, error) {
        req := &api.ControllerStartRequest{SandboxID: sandboxID}
        resp, err := s.client.Start(ctx, req)
        if err != nil {
                return sandbox.ControllerInstance{}, errdefs.FromGRPC(err)
        }

        var instance sandbox.ControllerInstance
        instance.SandboxID = sandboxID
        instance.Pid = resp.GetPid()
        instance.CreatedAt = resp.GetCreatedAt().AsTime()
        instance.Labels = resp.GetLabels()
        instance.Address = resp.GetAddress()
        instance.Version = resp.GetVersion()
        return instance, nil
}

// Platform asks the remote controller for the sandbox's platform and converts
// the answer into a platforms.Platform. Errors are translated with
// errdefs.FromGRPC.
func (s *remoteSandboxController) Platform(ctx context.Context, sandboxID string) (platforms.Platform, error) {
        req := &api.ControllerPlatformRequest{SandboxID: sandboxID}
        resp, err := s.client.Platform(ctx, req)
        if err != nil {
                return platforms.Platform{}, errdefs.FromGRPC(err)
        }

        remote := resp.GetPlatform()
        var result platforms.Platform
        result.Architecture = remote.GetArchitecture()
        result.OS = remote.GetOS()
        result.Variant = remote.GetVariant()
        return result, nil
}

// Stop asks the remote controller to stop the sandbox. When a timeout option
// was supplied, it is sent as whole seconds (the fractional part is dropped).
// Errors are translated with errdefs.FromGRPC.
func (s *remoteSandboxController) Stop(ctx context.Context, sandboxID string, opts ...sandbox.StopOpt) error {
        var stopOpts sandbox.StopOptions
        for _, apply := range opts {
                apply(&stopOpts)
        }

        req := &api.ControllerStopRequest{SandboxID: sandboxID}
        if timeout := stopOpts.Timeout; timeout != nil {
                req.TimeoutSecs = uint32(timeout.Seconds())
        }

        if _, err := s.client.Stop(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Shutdown asks the remote controller to shut the sandbox down. Errors are
// translated with errdefs.FromGRPC.
func (s *remoteSandboxController) Shutdown(ctx context.Context, sandboxID string) error {
        req := &api.ControllerShutdownRequest{SandboxID: sandboxID}
        if _, err := s.client.Shutdown(ctx, req); err != nil {
                return errdefs.FromGRPC(err)
        }
        return nil
}

// Wait issues a Wait call to the remote controller and returns the exit status
// and exit time from the response. Errors are translated with
// errdefs.FromGRPC.
func (s *remoteSandboxController) Wait(ctx context.Context, sandboxID string) (sandbox.ExitStatus, error) {
        req := &api.ControllerWaitRequest{SandboxID: sandboxID}
        resp, err := s.client.Wait(ctx, req)
        if err != nil {
                return sandbox.ExitStatus{}, errdefs.FromGRPC(err)
        }

        var exit sandbox.ExitStatus
        exit.ExitStatus = resp.GetExitStatus()
        exit.ExitedAt = resp.GetExitedAt().AsTime()
        return exit, nil
}

// Status queries the remote controller for the sandbox status, forwarding the
// verbose flag, and converts the response into a sandbox.ControllerStatus.
// Errors are translated with errdefs.FromGRPC.
func (s *remoteSandboxController) Status(ctx context.Context, sandboxID string, verbose bool) (sandbox.ControllerStatus, error) {
        req := &api.ControllerStatusRequest{SandboxID: sandboxID, Verbose: verbose}
        resp, err := s.client.Status(ctx, req)
        if err != nil {
                return sandbox.ControllerStatus{}, errdefs.FromGRPC(err)
        }

        var status sandbox.ControllerStatus
        status.SandboxID = sandboxID
        status.Pid = resp.GetPid()
        status.State = resp.GetState()
        status.Info = resp.GetInfo()
        status.CreatedAt = resp.GetCreatedAt().AsTime()
        status.ExitedAt = resp.GetExitedAt().AsTime()
        status.Extra = resp.GetExtra()
        status.Address = resp.GetAddress()
        status.Version = resp.GetVersion()
        return status, nil
}

// Metrics queries the remote controller for the sandbox metrics and returns
// the Metrics field of the response. Errors are translated with
// errdefs.FromGRPC.
func (s *remoteSandboxController) Metrics(ctx context.Context, sandboxID string) (*types.Metric, error) {
        req := &api.ControllerMetricsRequest{SandboxID: sandboxID}
        resp, err := s.client.Metrics(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        return resp.Metrics, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proxy

import (
        "context"

        api "github.com/containerd/containerd/api/services/sandbox/v1"
        sb "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/errdefs"
)

// remoteSandboxStore is a low-level containerd client to manage sandbox environments metadata
type remoteSandboxStore struct {
        // client is the generated GRPC store client that every method delegates to.
        client api.StoreClient
}

// Compile-time check that remoteSandboxStore implements sb.Store.
var _ sb.Store = (*remoteSandboxStore)(nil)

// NewSandboxStore wraps a GRPC store client in an sb.Store.
func NewSandboxStore(client api.StoreClient) sb.Store {
        store := new(remoteSandboxStore)
        store.client = client
        return store
}

// Create sends the sandbox to the remote store and returns the sandbox from
// the response. Errors are translated with errdefs.FromGRPC.
func (s *remoteSandboxStore) Create(ctx context.Context, sandbox sb.Sandbox) (sb.Sandbox, error) {
        req := &api.StoreCreateRequest{Sandbox: sb.ToProto(&sandbox)}
        resp, err := s.client.Create(ctx, req)
        if err != nil {
                return sb.Sandbox{}, errdefs.FromGRPC(err)
        }
        created := sb.FromProto(resp.Sandbox)
        return created, nil
}

// Update sends the sandbox and the field paths to update to the remote store
// and returns the sandbox from the response. Errors are translated with
// errdefs.FromGRPC.
func (s *remoteSandboxStore) Update(ctx context.Context, sandbox sb.Sandbox, fieldpaths ...string) (sb.Sandbox, error) {
        req := &api.StoreUpdateRequest{
                Sandbox: sb.ToProto(&sandbox),
                Fields:  fieldpaths,
        }
        resp, err := s.client.Update(ctx, req)
        if err != nil {
                return sb.Sandbox{}, errdefs.FromGRPC(err)
        }
        updated := sb.FromProto(resp.Sandbox)
        return updated, nil
}

// Get fetches the sandbox with the given id from the remote store. Errors are
// translated with errdefs.FromGRPC.
func (s *remoteSandboxStore) Get(ctx context.Context, id string) (sb.Sandbox, error) {
        req := &api.StoreGetRequest{SandboxID: id}
        resp, err := s.client.Get(ctx, req)
        if err != nil {
                return sb.Sandbox{}, errdefs.FromGRPC(err)
        }
        found := sb.FromProto(resp.Sandbox)
        return found, nil
}

// List asks the remote store for sandboxes, forwarding filters, and converts
// every returned entry. The result is a non-nil slice even when the store
// returns no entries. Errors are translated with errdefs.FromGRPC.
func (s *remoteSandboxStore) List(ctx context.Context, filters ...string) ([]sb.Sandbox, error) {
        req := &api.StoreListRequest{Filters: filters}
        resp, err := s.client.List(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }

        sandboxes := make([]sb.Sandbox, 0, len(resp.List))
        for _, item := range resp.List {
                sandboxes = append(sandboxes, sb.FromProto(item))
        }
        return sandboxes, nil
}

// Delete asks the remote store to delete the sandbox with the given id. The
// call's error is passed through errdefs.FromGRPC before being returned.
func (s *remoteSandboxStore) Delete(ctx context.Context, id string) error {
        req := &api.StoreDeleteRequest{SandboxID: id}
        _, err := s.client.Delete(ctx, req)
        return errdefs.FromGRPC(err)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sandbox

import (
        "context"
        "fmt"
        "time"

        "github.com/containerd/errdefs"
        "github.com/containerd/typeurl/v2"
)

// Sandbox is an object stored in metadata database
type Sandbox struct {
        // ID uniquely identifies the sandbox in a namespace
        ID string
        // Labels provide metadata extension for a sandbox.
        // AddLabel and GetLabel are helpers for writing and reading this map.
        Labels map[string]string
        // Runtime shim to use for this sandbox
        Runtime RuntimeOpts
        // Spec carries the runtime specification used to implement the sandbox
        Spec typeurl.Any
        // Sandboxer is the sandbox controller who manages the sandbox
        Sandboxer string
        // CreatedAt is the time at which the sandbox was created
        CreatedAt time.Time
        // UpdatedAt is the time at which the sandbox was updated
        UpdatedAt time.Time
        // Extensions stores client-specified metadata.
        // AddExtension and GetExtension marshal values into and out of this map.
        Extensions map[string]typeurl.Any
}

// RuntimeOpts holds runtime specific information
type RuntimeOpts struct {
        // Name is the name of the runtime.
        Name    string
        // Options carries runtime-specific options as a typeurl.Any; it may be unset.
        Options typeurl.Any
}

// Store is a storage interface for sandbox metadata objects
type Store interface {
        // Create a sandbox record in the store and return the stored sandbox.
        Create(ctx context.Context, sandbox Sandbox) (Sandbox, error)

        // Update the sandbox with the provided sandbox object and fields.
        // fieldpaths names the fields to update; the stored sandbox is returned.
        Update(ctx context.Context, sandbox Sandbox, fieldpaths ...string) (Sandbox, error)

        // Get sandbox metadata using the id
        Get(ctx context.Context, id string) (Sandbox, error)

        // List returns sandboxes that match one or more of the provided filters
        List(ctx context.Context, filters ...string) ([]Sandbox, error)

        // Delete a sandbox from metadata store using the id
        Delete(ctx context.Context, id string) error
}

// AddExtension marshals obj and stores it in the sandbox's Extensions under
// name, replacing any existing entry. The Extensions map is created on first
// use, before marshaling is attempted.
func (s *Sandbox) AddExtension(name string, obj interface{}) error {
        if s.Extensions == nil {
                s.Extensions = make(map[string]typeurl.Any)
        }

        marshaled, err := typeurl.MarshalAny(obj)
        if err != nil {
                return fmt.Errorf("failed to marshal sandbox extension %q: %w", name, err)
        }
        s.Extensions[name] = marshaled

        return nil
}

// AddLabel sets the label name to value, creating the Labels map on first use
// and overwriting any existing value.
func (s *Sandbox) AddLabel(name string, value string) {
        if s.Labels == nil {
                s.Labels = make(map[string]string)
        }
        s.Labels[name] = value
}

// GetExtension unmarshals the extension stored under name into obj.
//
// It returns errdefs.ErrNotFound (unwrapped) when no such extension exists and
// a wrapped error when unmarshaling fails.
func (s *Sandbox) GetExtension(name string, obj interface{}) error {
        ext, found := s.Extensions[name]
        if !found {
                return errdefs.ErrNotFound
        }

        err := typeurl.UnmarshalTo(ext, obj)
        if err != nil {
                return fmt.Errorf("failed to unmarshal sandbox extension %q: %w", name, err)
        }
        return nil
}

// GetLabel returns the value of the label name. When the label is absent it
// returns an empty string and an error wrapping errdefs.ErrNotFound.
func (s *Sandbox) GetLabel(name string) (string, error) {
        if value, found := s.Labels[name]; found {
                return value, nil
        }
        return "", fmt.Errorf("unable to find label %q in sandbox metadata: %w", name, errdefs.ErrNotFound)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proxy

import (
        "context"
        "io"

        snapshotsapi "github.com/containerd/containerd/api/services/snapshots/v1"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        protobuftypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/errdefs"
)

// NewSnapshotter returns a Snapshotter that talks to the containerd snapshot
// GRPC API through client. Every request it sends names snapshotterName as the
// target snapshotter.
func NewSnapshotter(client snapshotsapi.SnapshotsClient, snapshotterName string) snapshots.Snapshotter {
        proxy := new(proxySnapshotter)
        proxy.client = client
        proxy.snapshotterName = snapshotterName
        return proxy
}

// proxySnapshotter implements the snapshotter operations by forwarding each
// call to a snapshots GRPC client.
type proxySnapshotter struct {
        // client is the GRPC client that receives every request.
        client          snapshotsapi.SnapshotsClient
        // snapshotterName is sent as the Snapshotter field of every request.
        snapshotterName string
}

// Stat requests the info of the snapshot identified by key. Errors are
// translated with errdefs.FromGRPC.
func (p *proxySnapshotter) Stat(ctx context.Context, key string) (snapshots.Info, error) {
        req := &snapshotsapi.StatSnapshotRequest{
                Snapshotter: p.snapshotterName,
                Key:         key,
        }
        resp, err := p.client.Stat(ctx, req)
        if err != nil {
                return snapshots.Info{}, errdefs.FromGRPC(err)
        }
        info := snapshots.InfoFromProto(resp.Info)
        return info, nil
}

// Update sends info to the remote snapshotter with fieldpaths as the request's
// update mask and returns the info from the response. Errors are translated
// with errdefs.FromGRPC.
func (p *proxySnapshotter) Update(ctx context.Context, info snapshots.Info, fieldpaths ...string) (snapshots.Info, error) {
        mask := &protobuftypes.FieldMask{Paths: fieldpaths}
        req := &snapshotsapi.UpdateSnapshotRequest{
                Snapshotter: p.snapshotterName,
                Info:        snapshots.InfoToProto(info),
                UpdateMask:  mask,
        }
        resp, err := p.client.Update(ctx, req)
        if err != nil {
                return snapshots.Info{}, errdefs.FromGRPC(err)
        }
        updated := snapshots.InfoFromProto(resp.Info)
        return updated, nil
}

// Usage requests the resource usage of the snapshot identified by key. Errors
// are translated with errdefs.FromGRPC.
func (p *proxySnapshotter) Usage(ctx context.Context, key string) (snapshots.Usage, error) {
        req := &snapshotsapi.UsageRequest{
                Snapshotter: p.snapshotterName,
                Key:         key,
        }
        resp, err := p.client.Usage(ctx, req)
        if err != nil {
                return snapshots.Usage{}, errdefs.FromGRPC(err)
        }
        usage := snapshots.UsageFromProto(resp)
        return usage, nil
}

// Mounts requests the mounts of the snapshot identified by key. Errors are
// translated with errdefs.FromGRPC.
func (p *proxySnapshotter) Mounts(ctx context.Context, key string) ([]mount.Mount, error) {
        req := &snapshotsapi.MountsRequest{
                Snapshotter: p.snapshotterName,
                Key:         key,
        }
        resp, err := p.client.Mounts(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        mounts := mount.FromProto(resp.Mounts)
        return mounts, nil
}

// Prepare sends a Prepare request for key with the given parent and returns
// the mounts from the response.
//
// opts are applied to a scratch snapshots.Info; only its Labels are sent. An
// option error is returned as-is, remote errors go through errdefs.FromGRPC.
func (p *proxySnapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
        var info snapshots.Info
        for _, apply := range opts {
                if err := apply(&info); err != nil {
                        return nil, err
                }
        }

        req := &snapshotsapi.PrepareSnapshotRequest{
                Snapshotter: p.snapshotterName,
                Key:         key,
                Parent:      parent,
                Labels:      info.Labels,
        }
        resp, err := p.client.Prepare(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        mounts := mount.FromProto(resp.Mounts)
        return mounts, nil
}

// View sends a ViewSnapshotRequest for key and parent to the remote snapshots
// service and returns the mounts from the response.
//
// The options are applied to a local snapshots.Info first; only the resulting
// Labels are forwarded in the request. An option error is returned as-is,
// while errors from the call are passed through errdefs.FromGRPC.
func (p *proxySnapshotter) View(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
        var info snapshots.Info
        for _, apply := range opts {
                if err := apply(&info); err != nil {
                        return nil, err
                }
        }

        req := &snapshotsapi.ViewSnapshotRequest{
                Snapshotter: p.snapshotterName,
                Key:         key,
                Parent:      parent,
                Labels:      info.Labels,
        }
        reply, err := p.client.View(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        return mount.FromProto(reply.Mounts), nil
}

// Commit sends a CommitSnapshotRequest for name and key to the remote
// snapshots service.
//
// The options are applied to a local snapshots.Info first; only the resulting
// Labels are forwarded in the request. An option error is returned as-is; the
// result of the call is always passed through errdefs.FromGRPC.
func (p *proxySnapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error {
        var info snapshots.Info
        for _, apply := range opts {
                if err := apply(&info); err != nil {
                        return err
                }
        }

        req := &snapshotsapi.CommitSnapshotRequest{
                Snapshotter: p.snapshotterName,
                Name:        name,
                Key:         key,
                Labels:      info.Labels,
        }
        _, err := p.client.Commit(ctx, req)
        return errdefs.FromGRPC(err)
}

// Remove sends a RemoveSnapshotRequest for key to the remote snapshots
// service. The result of the call is passed through errdefs.FromGRPC.
func (p *proxySnapshotter) Remove(ctx context.Context, key string) error {
        req := &snapshotsapi.RemoveSnapshotRequest{
                Snapshotter: p.snapshotterName,
                Key:         key,
        }
        _, err := p.client.Remove(ctx, req)
        return errdefs.FromGRPC(err)
}

// Walk opens a List stream on the remote snapshots service, passing fs as the
// request filters, and calls fn for every snapshot info received.
//
// The walk ends without error when the stream reports io.EOF or yields a nil
// response. An error returned by fn stops the walk and is returned unchanged;
// stream errors are passed through errdefs.FromGRPC.
func (p *proxySnapshotter) Walk(ctx context.Context, fn snapshots.WalkFunc, fs ...string) error {
        req := &snapshotsapi.ListSnapshotsRequest{
                Snapshotter: p.snapshotterName,
                Filters:     fs,
        }
        stream, err := p.client.List(ctx, req)
        if err != nil {
                return errdefs.FromGRPC(err)
        }

        for {
                batch, err := stream.Recv()
                switch {
                case err == io.EOF:
                        // End of stream: every snapshot has been visited.
                        return nil
                case err != nil:
                        return errdefs.FromGRPC(err)
                case batch == nil:
                        return nil
                }

                for _, item := range batch.Info {
                        if err := fn(ctx, snapshots.InfoFromProto(item)); err != nil {
                                return err
                        }
                }
        }
}

// Close does nothing and always returns nil.
func (p *proxySnapshotter) Close() error {
        return nil
}

// Cleanup sends a CleanupRequest for this snapshotter to the remote snapshots
// service. The result of the call is passed through errdefs.FromGRPC.
func (p *proxySnapshotter) Cleanup(ctx context.Context) error {
        req := &snapshotsapi.CleanupRequest{
                Snapshotter: p.snapshotterName,
        }
        _, err := p.client.Cleanup(ctx, req)
        return errdefs.FromGRPC(err)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package snapshots

import (
        "context"
        "encoding/json"
        "strings"
        "time"

        snapshotsapi "github.com/containerd/containerd/api/services/snapshots/v1"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/pkg/protobuf"
)

const (
        // UnpackKeyPrefix is the beginning of the key format used for snapshots that will have
        // image content unpacked into them.
        UnpackKeyPrefix = "extract"
        // UnpackKeyFormat is the format for the snapshotter keys used for extraction.
        // It expands to "extract-%s %s" and so takes two string arguments.
        UnpackKeyFormat       = UnpackKeyPrefix + "-%s %s"
        // inheritedLabelsPrefix is the label key prefix kept by FilterInheritedLabels.
        inheritedLabelsPrefix = "containerd.io/snapshot/"
        // labelSnapshotRef is the single label key outside inheritedLabelsPrefix
        // that FilterInheritedLabels also keeps.
        labelSnapshotRef      = "containerd.io/snapshot.ref"

        // LabelSnapshotUIDMapping is the label used for UID mappings
        LabelSnapshotUIDMapping = "containerd.io/snapshot/uidmapping"
        // LabelSnapshotGIDMapping is the label used for GID mappings
        LabelSnapshotGIDMapping = "containerd.io/snapshot/gidmapping"
)

// Kind identifies the kind of snapshot. Its zero value is KindUnknown.
type Kind uint8

// Definitions of snapshot kinds.
const (
        // KindUnknown is the zero value and is what ParseKind returns for a
        // string it does not recognize.
        KindUnknown Kind = iota
        // KindView is rendered as "View" by Kind.String.
        KindView
        // KindActive is rendered as "Active" by Kind.String.
        KindActive
        // KindCommitted is rendered as "Committed" by Kind.String.
        KindCommitted
)

// ParseKind converts the name of a snapshot kind into a Kind. The name is
// lower-cased before matching, so "view", "View" and "VIEW" are equivalent.
//
// KindUnknown is returned for any name that is not "view", "active" or
// "committed".
func ParseKind(s string) Kind {
        switch name := strings.ToLower(s); name {
        case "committed":
                return KindCommitted
        case "active":
                return KindActive
        case "view":
                return KindView
        }
        return KindUnknown
}

// String returns the capitalized name of the Kind: "View", "Active" or
// "Committed". Every other value, including KindUnknown, yields "Unknown".
func (k Kind) String() string {
        if k == KindView {
                return "View"
        }
        if k == KindActive {
                return "Active"
        }
        if k == KindCommitted {
                return "Committed"
        }
        return "Unknown"
}

// MarshalJSON encodes the Kind as a JSON string holding the value of
// k.String().
func (k Kind) MarshalJSON() ([]byte, error) {
        name := k.String()
        return json.Marshal(name)
}

// UnmarshalJSON decodes a JSON string and stores the result of ParseKind on
// that string in k. If b is not a JSON string the decoding error is returned
// and k is left untouched.
func (k *Kind) UnmarshalJSON(b []byte) error {
        var name string
        err := json.Unmarshal(b, &name)
        if err == nil {
                *k = ParseKind(name)
        }
        return err
}

// KindToProto converts from [Kind] to the protobuf definition [snapshots.Kind].
//
// KindActive and KindView map to their protobuf counterparts; every other
// value, including KindUnknown, maps to Kind_COMMITTED.
func KindToProto(kind Kind) snapshotsapi.Kind {
        if kind == KindActive {
                return snapshotsapi.Kind_ACTIVE
        }
        if kind == KindView {
                return snapshotsapi.Kind_VIEW
        }
        return snapshotsapi.Kind_COMMITTED
}

// KindFromProto converts from the protobuf definition [snapshots.Kind] to
// [Kind].
//
// Kind_ACTIVE and Kind_VIEW map to their counterparts; every other value maps
// to KindCommitted.
func KindFromProto(kind snapshotsapi.Kind) Kind {
        if kind == snapshotsapi.Kind_ACTIVE {
                return KindActive
        }
        if kind == snapshotsapi.Kind_VIEW {
                return KindView
        }
        return KindCommitted
}

// Info provides information about a particular snapshot.
// JSON marshalling is supported for interacting with tools like ctr.
type Info struct {
        Kind   Kind   // kind of snapshot (view, active or committed)
        Name   string // name or key of snapshot
        Parent string `json:",omitempty"` // name of parent snapshot

        // Labels for a snapshot.
        //
        // Note: only labels prefixed with `containerd.io/snapshot/` will be inherited
        // by the snapshotter's `Prepare`, `View`, or `Commit` calls.
        Labels  map[string]string `json:",omitempty"`
        Created time.Time         `json:",omitempty"` // Created time
        Updated time.Time         `json:",omitempty"` // Last update time
}

// InfoToProto converts from [Info] to the protobuf definition [snapshots.Info].
//
// The kind is translated with KindToProto and both timestamps with
// protobuf.ToTimestamp. The Labels map is shared with info, not copied.
func InfoToProto(info Info) *snapshotsapi.Info {
        out := new(snapshotsapi.Info)
        out.Name = info.Name
        out.Parent = info.Parent
        out.Kind = KindToProto(info.Kind)
        out.CreatedAt = protobuf.ToTimestamp(info.Created)
        out.UpdatedAt = protobuf.ToTimestamp(info.Updated)
        out.Labels = info.Labels
        return out
}

// InfoFromProto converts from the protobuf definition [snapshots.Info] to
// [Info].
//
// The kind is translated with KindFromProto and both timestamps with
// protobuf.FromTimestamp. The Labels map is shared with info, not copied.
// info is dereferenced directly, so it must not be nil.
func InfoFromProto(info *snapshotsapi.Info) Info {
        var out Info
        out.Name = info.Name
        out.Parent = info.Parent
        out.Kind = KindFromProto(info.Kind)
        out.Created = protobuf.FromTimestamp(info.CreatedAt)
        out.Updated = protobuf.FromTimestamp(info.UpdatedAt)
        out.Labels = info.Labels
        return out
}

// Usage defines statistics for disk resources consumed by the snapshot.
//
// These resources only include the resources consumed by the snapshot itself
// and do not include resource usage by the parent.
type Usage struct {
        Inodes int64 // number of inodes in use.
        Size   int64 // provides usage, in bytes, of snapshot
}

// Add accumulates other into u by summing both the inode count and the size.
func (u *Usage) Add(other Usage) {
        // TODO(stevvooe): summing inode counts assumes the inodes of each
        // snapshot are independent, which only gives an upper bound. It should
        // be close as long as a snapshot's inodes are roughly unique to it, but
        // don't trust this assumption.
        u.Inodes += other.Inodes

        u.Size += other.Size
}

// UsageFromProto converts from the protobuf definition [snapshots.Usage] to
// [Usage] by copying the Inodes and Size fields. resp is dereferenced
// directly, so it must not be nil.
func UsageFromProto(resp *snapshotsapi.UsageResponse) Usage {
        var usage Usage
        usage.Inodes = resp.Inodes
        usage.Size = resp.Size
        return usage
}

// UsageToProto converts from [Usage] to the protobuf definition
// [snapshots.Usage] by copying the Inodes and Size fields into a new
// UsageResponse.
func UsageToProto(usage Usage) *snapshotsapi.UsageResponse {
        resp := new(snapshotsapi.UsageResponse)
        resp.Inodes = usage.Inodes
        resp.Size = usage.Size
        return resp
}

// WalkFunc defines the callback for a snapshot walk. It receives the walk's
// context and the Info of the snapshot being visited.
type WalkFunc func(context.Context, Info) error

// Snapshotter defines the methods required to implement a snapshotter for
// allocating, snapshotting and mounting filesystem changesets. The model works
// by building up sets of changes with parent-child relationships.
//
// A snapshot represents a filesystem state. Every snapshot has a parent, where
// the empty parent is represented by the empty string. A diff can be taken
// between a parent and its snapshot to generate a classic layer.
//
// An active snapshot is created by calling `Prepare`. After mounting, changes
// can be made to the snapshot. The act of committing creates a committed
// snapshot. The committed snapshot will get the parent of active snapshot. The
// committed snapshot can then be used as a parent. Active snapshots can never
// act as a parent.
//
// Snapshots are best understood by their lifecycle. Active snapshots are
// always created with Prepare or View. Committed snapshots are always created
// with Commit.  Active snapshots never become committed snapshots and vice
// versa. All snapshots may be removed.
//
// For consistency, we define the following terms to be used throughout this
// interface for snapshotter implementations:
//
//        `ctx` - refers to a context.Context
//        `key` - refers to an active snapshot
//        `name` - refers to a committed snapshot
//        `parent` - refers to the committed snapshot another snapshot descends from
//
// Most methods take various combinations of these identifiers. Typically,
// `name` and `parent` will be used in cases where a method *only* takes
// committed snapshots. `key` will be used to refer to active snapshots in most
// cases, except where noted. All variables used to access snapshots use the
// same key space. For example, an active snapshot may not share the same key
// with a committed snapshot.
//
// We cover several examples below to demonstrate the utility of the snapshotter.
//
// # Importing a Layer
//
// To import a layer, we simply have the snapshotter provide a list of
// mounts to be applied such that our dst will capture a changeset. We start
// out by getting a path to the layer tar file and creating a temp location to
// unpack it to:
//
//        layerPath, tmpDir := getLayerPath(), mkTmpDir() // just a path to layer tar file.
//
// We start by using the snapshotter to Prepare a new snapshot transaction, using a
// key and descending from the empty parent "". To prevent our layer from being
// garbage collected during unpacking, we add the `containerd.io/gc.root` label:
//
//        noGcOpt := snapshots.WithLabels(map[string]string{
//                "containerd.io/gc.root": time.Now().UTC().Format(time.RFC3339),
//        })
//        mounts, err := snapshotter.Prepare(ctx, key, "", noGcOpt)
//        if err != nil { ... }
//
// We get back a list of mounts from snapshotter.Prepare(), with the key identifying
// the active snapshot. Mount this to the temporary location with the
// following:
//
//        if err := mount.All(mounts, tmpDir); err != nil { ... }
//
// Once the mounts are performed, our temporary location is ready to capture
// a diff. In practice, this works similar to a filesystem transaction. The
// next step is to unpack the layer. We have a special function unpackLayer
// that applies the contents of the layer to target location and calculates the
// DiffID of the unpacked layer (this is a requirement for docker
// implementation):
//
//        layer, err := os.Open(layerPath)
//        if err != nil { ... }
//        digest, err := unpackLayer(tmpDir, layer) // unpack into layer location
//        if err != nil { ... }
//
// When the above completes, we should have a filesystem that represents the
// contents of the layer. Careful implementations should verify that digest
// matches the expected DiffID. When completed, we unmount the mounts:
//
//        unmount(mounts) // optional, for now
//
// Now that we've verified and unpacked our layer, we commit the active
// snapshot to a name. For this example, we are just going to use the layer
// digest, but in practice, this will probably be the ChainID. This also removes
// the active snapshot:
//
//        if err := snapshotter.Commit(ctx, digest.String(), key, noGcOpt); err != nil { ... }
//
// Now, we have a layer in the snapshotter that can be accessed with the digest
// provided during commit.
//
// # Importing the Next Layer
//
// Making a layer depend on the above is identical to the process described
// above except that the parent is provided as parent when calling
// snapshotter.Prepare(), assuming a clean, unique key identifier:
//
//        mounts, err := snapshotter.Prepare(ctx, key, parentDigest, noGcOpt)
//
// We then mount, apply and commit, as we did above. The new snapshot will be
// based on the content of the previous one.
//
// # Running a Container
//
// To run a container, we simply provide snapshotter.Prepare() the committed image
// snapshot as the parent. After mounting, the prepared path can
// be used directly as the container's filesystem:
//
//        mounts, err := snapshotter.Prepare(ctx, containerKey, imageRootFSChainID)
//
// The returned mounts can then be passed directly to the container runtime. If
// one would like to create a new image from the filesystem, snapshotter.Commit() is
// called:
//
//        if err := snapshotter.Commit(ctx, newImageSnapshot, containerKey); err != nil { ... }
//
// Alternatively, for most container runs, snapshotter.Remove() will be called to
// signal the snapshotter to abandon the changes.
type Snapshotter interface {
        // Stat returns the info for an active or committed snapshot by name or
        // key.
        //
        // Should be used for parent resolution, existence checks and to discern
        // the kind of snapshot.
        Stat(ctx context.Context, key string) (Info, error)

        // Update updates the info for a snapshot.
        //
        // Only mutable properties of a snapshot may be updated.
        Update(ctx context.Context, info Info, fieldpaths ...string) (Info, error)

        // Usage returns the resource usage of an active or committed snapshot
        // excluding the usage of parent snapshots.
        //
        // The running time of this call for active snapshots is dependent on
        // implementation, but may be proportional to the size of the resource.
        // Callers should take this into consideration. Implementations should
        // attempt to honor context cancellation and avoid taking locks when making
        // the calculation.
        Usage(ctx context.Context, key string) (Usage, error)

        // Mounts returns the mounts for the active snapshot transaction identified
        // by key. Can be called on a read-write or readonly transaction. This is
        // available only for active snapshots.
        //
        // This can be used to recover mounts after calling View or Prepare.
        Mounts(ctx context.Context, key string) ([]mount.Mount, error)

        // Prepare creates an active snapshot identified by key descending from the
        // provided parent.  The returned mounts can be used to mount the snapshot
        // to capture changes.
        //
        // If a parent is provided, after performing the mounts, the destination
        // will start with the content of the parent. The parent must be a
        // committed snapshot. Changes to the mounted destination will be captured
        // in relation to the parent. The default parent, "", is an empty
        // directory.
        //
        // The changes may be saved to a committed snapshot by calling Commit. When
        // one is done with the transaction, Remove should be called on the key.
        //
        // Multiple calls to Prepare or View with the same key should fail.
        Prepare(ctx context.Context, key, parent string, opts ...Opt) ([]mount.Mount, error)

        // View behaves identically to Prepare except the result may not be
        // committed back to the snapshotter. View returns a readonly view on
        // the parent, with the active snapshot being tracked by the given key.
        //
        // This method operates identically to Prepare, except the mounts returned
        // may have the readonly flag set. Any modifications to the underlying
        // filesystem will be ignored. Implementations may perform this in a more
        // efficient manner that differs from what would be attempted with
        // `Prepare`.
        //
        // Commit may not be called on the provided key and will return an error.
        // To collect the resources associated with key, Remove must be called with
        // key as the argument.
        View(ctx context.Context, key, parent string, opts ...Opt) ([]mount.Mount, error)

        // Commit captures the changes between key and its parent into a snapshot
        // identified by name.  The name can then be used with the snapshotter's other
        // methods to create subsequent snapshots.
        //
        // A committed snapshot will be created under name with the parent of the
        // active snapshot.
        //
        // After commit, the snapshot identified by key is removed.
        Commit(ctx context.Context, name, key string, opts ...Opt) error

        // Remove the committed or active snapshot by the provided key.
        //
        // All resources associated with the key will be removed.
        //
        // If the snapshot is a parent of another snapshot, its children must be
        // removed before proceeding.
        Remove(ctx context.Context, key string) error

        // Walk will call the provided function for each snapshot in the
        // snapshotter that matches the provided filters. If no filters are
        // given all items will be walked.
        //
        // Filters:
        //  name
        //  parent
        //  kind (active,view,committed)
        //  labels.(label)
        Walk(ctx context.Context, fn WalkFunc, filters ...string) error

        // Close releases the internal resources.
        //
        // Close is expected to be called at the end of the lifecycle of the
        // snapshotter, but calling it is not mandatory.
        //
        // Close returns nil when it is already closed.
        Close() error
}

// Cleaner defines a type capable of performing asynchronous resource cleanup.
// The Cleaner interface should be used by snapshotters which implement fast
// removal and deferred resource cleanup. This prevents snapshots from needing
// to perform lengthy resource cleanup before acknowledging a snapshot key
// has been removed and available for re-use. This is also useful when
// performing multi-key removal with the intent of cleaning up all the
// resources after each snapshot key has been removed.
type Cleaner interface {
        // Cleanup performs the deferred resource cleanup described above.
        Cleanup(ctx context.Context) error
}

// Opt allows setting mutable snapshot properties on creation. An Opt mutates
// the Info it is given and may return an error to reject the operation.
type Opt func(info *Info) error

// WithLabels returns an Opt that merges labels into the Info's Labels map.
//
// The map is created when the Info has none yet, even if labels is empty. A
// key already present in the Info is overwritten by the value from labels.
// The returned Opt never fails.
func WithLabels(labels map[string]string) Opt {
        merge := func(info *Info) error {
                dst := info.Labels
                if dst == nil {
                        dst = make(map[string]string)
                        info.Labels = dst
                }

                for name, value := range labels {
                        dst[name] = value
                }

                return nil
        }
        return merge
}

// FilterInheritedLabels returns a new map holding only the snapshot labels
// found in labels: keys starting with "containerd.io/snapshot/" and the
// "containerd.io/snapshot.ref" key. The input map is not modified.
//
// A nil input yields nil; any other input yields a non-nil (possibly empty)
// map.
func FilterInheritedLabels(labels map[string]string) map[string]string {
        if labels == nil {
                return nil
        }

        kept := map[string]string{}
        for name, value := range labels {
                if name != labelSnapshotRef && !strings.HasPrefix(name, inheritedLabelsPrefix) {
                        continue
                }
                kept[name] = value
        }
        return kept
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package storage

import (
        "context"
        "encoding/binary"
        "errors"
        "fmt"
        "strconv"
        "strings"
        "time"

        "github.com/containerd/containerd/v2/core/metadata/boltutil"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/pkg/filters"
        "github.com/containerd/errdefs"
        bolt "go.etcd.io/bbolt"
)

var (
        // Bucket names used by the metadata store.
        // NOTE(review): how these buckets nest is decided by the bucket helpers
        // (withBucket and friends) defined elsewhere in this package — confirm there.
        bucketKeyStorageVersion = []byte("v1")
        bucketKeySnapshot       = []byte("snapshots")
        bucketKeyParents        = []byte("parents")

        // Keys stored inside an individual snapshot's bucket; GetSnapshot reads
        // bucketKeyParent this way.
        // NOTE(review): the remaining keys are presumably read and written by
        // readSnapshot/putSnapshot and the usage helpers — confirm against them.
        bucketKeyID     = []byte("id")
        bucketKeyParent = []byte("parent")
        bucketKeyKind   = []byte("kind")
        bucketKeyInodes = []byte("inodes")
        bucketKeySize   = []byte("size")

        // ErrNoTransaction is returned when an operation is attempted with
        // a context which is not inside of a transaction.
        ErrNoTransaction = errors.New("no transaction in context")
)

// parentKey returns a composite key of the parent and child identifiers. Each
// identifier is uvarint-encoded and the two parts are separated by a zero
// byte: uvarint(parent), 0x00, uvarint(child).
func parentKey(parent, child uint64) []byte {
        // A uvarint-encoded uint64 needs up to binary.MaxVarintLen64 (10) bytes,
        // more than its 8-byte fixed width, so size the buffer for the worst
        // case of both parts plus the separator. Sizing it from the fixed width
        // makes PutUvarint panic for identifiers of 2^56 and above.
        b := make([]byte, 2*binary.MaxVarintLen64+1)
        i := binary.PutUvarint(b, parent)
        // b[i] stays zero and acts as the separator; the child starts after it.
        j := binary.PutUvarint(b[i+1:], child)
        return b[0 : i+j+1]
}

// parentPrefixKey returns the parent part of the composite key with the
// zero byte separator: uvarint(parent), 0x00. The result is a prefix of every
// key parentKey produces for the same parent.
func parentPrefixKey(parent uint64) []byte {
        // A uvarint-encoded uint64 needs up to binary.MaxVarintLen64 (10) bytes,
        // more than its 8-byte fixed width; sizing the buffer from the fixed
        // width panics for parents of 2^56 and above.
        b := make([]byte, binary.MaxVarintLen64+1)
        i := binary.PutUvarint(b, parent)
        // b[i] stays zero and acts as the separator.
        return b[0 : i+1]
}

// getParentPrefix decodes the leading uvarint of a composite key, which is
// the parent identifier. The number of bytes consumed is discarded, so a key
// that does not start with a complete uvarint (for example a nil key) yields 0.
func getParentPrefix(b []byte) uint64 {
        id, _ := binary.Uvarint(b)
        return id
}

// GetInfo reads the metadata stored for key and returns the snapshot's
// identifier formatted as a decimal string, its Info (with Name set to key)
// and its Usage. Requires a context with a storage transaction.
//
// On error the returned identifier, Info and Usage are all zero values.
func GetInfo(ctx context.Context, key string) (string, snapshots.Info, snapshots.Usage, error) {
        var (
                id    uint64
                usage snapshots.Usage
        )
        info := snapshots.Info{Name: key}

        read := func(ctx context.Context, bkt, pbkt *bolt.Bucket) error {
                getUsage(bkt, &usage)
                return readSnapshot(bkt, &id, &info)
        }
        if err := withSnapshotBucket(ctx, key, read); err != nil {
                return "", snapshots.Info{}, snapshots.Usage{}, err
        }

        return strconv.FormatUint(id, 10), info, usage, nil
}

// UpdateInfo updates the stored metadata of the snapshot named info.Name and
// returns the resulting Info.
//
// Labels are the only mutable field. With no fieldpaths the stored labels are
// replaced by info.Labels. Otherwise each path is applied in order:
//
//   - "labels" replaces all labels with info.Labels;
//   - "labels.<key>" sets that single label to info.Labels[<key>];
//   - any other path fails with errdefs.ErrInvalidArgument.
//
// The Updated timestamp is set to the current UTC time. A missing snapshot
// fails with errdefs.ErrNotFound. On error a zero Info is returned.
func UpdateInfo(ctx context.Context, info snapshots.Info, fieldpaths ...string) (snapshots.Info, error) {
        current := snapshots.Info{Name: info.Name}

        apply := func(ctx context.Context, bkt, pbkt *bolt.Bucket) error {
                sbkt := bkt.Bucket([]byte(info.Name))
                if sbkt == nil {
                        return fmt.Errorf("snapshot does not exist: %w", errdefs.ErrNotFound)
                }
                if err := readSnapshot(sbkt, nil, &current); err != nil {
                        return err
                }

                if len(fieldpaths) == 0 {
                        // No mask given: set every mutable field.
                        current.Labels = info.Labels
                }
                for _, path := range fieldpaths {
                        if label, ok := strings.CutPrefix(path, "labels."); ok {
                                if current.Labels == nil {
                                        current.Labels = map[string]string{}
                                }
                                current.Labels[label] = info.Labels[label]
                                continue
                        }

                        if path != "labels" {
                                return fmt.Errorf("cannot update %q field on snapshot %q: %w", path, info.Name, errdefs.ErrInvalidArgument)
                        }
                        current.Labels = info.Labels
                }

                current.Updated = time.Now().UTC()
                if err := boltutil.WriteTimestamps(sbkt, current.Created, current.Updated); err != nil {
                        return err
                }

                return boltutil.WriteLabels(sbkt, current.Labels)
        }

        if err := withBucket(ctx, apply); err != nil {
                return snapshots.Info{}, err
        }
        return current, nil
}

// WalkInfo visits the metadata Info of every stored snapshot and calls fn for
// each one that matches the filters in fs. Requires a context with a storage
// transaction.
//
// The filters are parsed before anything is read; a parse error is returned
// without calling fn. The first error from reading a snapshot or from fn stops
// the walk and is returned.
func WalkInfo(ctx context.Context, fn snapshots.WalkFunc, fs ...string) error {
        filter, err := filters.ParseAll(fs...)
        if err != nil {
                return err
        }

        // TODO: allow indexes (name, parent, specific labels)
        return withBucket(ctx, func(ctx context.Context, bkt, pbkt *bolt.Bucket) error {
                visit := func(name, value []byte) error {
                        // Entries with a value are plain keys, not snapshot buckets.
                        if value != nil {
                                return nil
                        }

                        info := snapshots.Info{Name: string(name)}
                        if err := readSnapshot(bkt.Bucket(name), nil, &info); err != nil {
                                return err
                        }
                        if filter.Match(adaptSnapshot(info)) {
                                return fn(ctx, info)
                        }
                        return nil
                }
                return bkt.ForEach(visit)
        })
}

// GetSnapshot returns the metadata for the active or view snapshot transaction
// referenced by the given key. Requires a context with a storage transaction.
//
// The result carries the snapshot's identifier as a decimal string, its kind
// and, when the snapshot records a parent, the parent chain computed by
// parents. It fails with errdefs.ErrNotFound when the snapshot or its recorded
// parent is missing, and with errdefs.ErrFailedPrecondition when the snapshot
// is neither active nor a view. On error a zero Snapshot is returned.
func GetSnapshot(ctx context.Context, key string) (s Snapshot, err error) {
        load := func(ctx context.Context, bkt, pbkt *bolt.Bucket) error {
                sbkt := bkt.Bucket([]byte(key))
                if sbkt == nil {
                        return fmt.Errorf("snapshot does not exist: %w", errdefs.ErrNotFound)
                }

                s.ID = strconv.FormatUint(readID(sbkt), 10)
                s.Kind = readKind(sbkt)

                switch s.Kind {
                case snapshots.KindActive, snapshots.KindView:
                        // Only these kinds may be looked up here.
                default:
                        return fmt.Errorf("requested snapshot %v not active or view: %w", key, errdefs.ErrFailedPrecondition)
                }

                parentName := sbkt.Get(bucketKeyParent)
                if len(parentName) == 0 {
                        // No parent recorded, so there is no chain to resolve.
                        return nil
                }

                spbkt := bkt.Bucket(parentName)
                if spbkt == nil {
                        return fmt.Errorf("parent does not exist: %w", errdefs.ErrNotFound)
                }

                chain, chainErr := parents(bkt, spbkt, readID(spbkt))
                if chainErr != nil {
                        return fmt.Errorf("failed to get parent chain: %w", chainErr)
                }
                s.ParentIDs = chain
                return nil
        }

        if err = withBucket(ctx, load); err != nil {
                return Snapshot{}, err
        }
        return s, nil
}

// CreateSnapshot inserts a record for an active or view snapshot with the provided parent.
//
// kind must be KindActive or KindView; anything else fails with
// errdefs.ErrInvalidArgument before the store is touched. The opts are applied
// to a scratch Info and only its Labels are stored with the new snapshot.
//
// A non-empty parent must name an existing snapshot (errdefs.ErrNotFound
// otherwise) of kind KindCommitted (errdefs.ErrInvalidArgument otherwise). If
// a snapshot with the same key already exists the call fails with
// errdefs.ErrAlreadyExists.
//
// The returned Snapshot carries the new identifier as a decimal string, the
// kind and, when a parent was given, the parent chain computed by parents. On
// error a zero Snapshot is returned.
func CreateSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts ...snapshots.Opt) (s Snapshot, err error) {
        switch kind {
        case snapshots.KindActive, snapshots.KindView:
        default:
                return Snapshot{}, fmt.Errorf("snapshot type %v invalid; only snapshots of type Active or View can be created: %w", kind, errdefs.ErrInvalidArgument)
        }
        // Collect the creation options; only base.Labels is used below.
        var base snapshots.Info
        for _, opt := range opts {
                if err := opt(&base); err != nil {
                        return Snapshot{}, err
                }
        }

        err = createBucketIfNotExists(ctx, func(ctx context.Context, bkt, pbkt *bolt.Bucket) error {
                var (
                        // spbkt is the parent's bucket; it stays nil when no parent was given.
                        spbkt *bolt.Bucket
                )
                if parent != "" {
                        spbkt = bkt.Bucket([]byte(parent))
                        if spbkt == nil {
                                return fmt.Errorf("missing parent %q bucket: %w", parent, errdefs.ErrNotFound)
                        }

                        if readKind(spbkt) != snapshots.KindCommitted {
                                return fmt.Errorf("parent %q is not committed snapshot: %w", parent, errdefs.ErrInvalidArgument)
                        }
                }
                // Note: err declared here is local to this callback and shadows the
                // named result of CreateSnapshot.
                sbkt, err := bkt.CreateBucket([]byte(key))
                if err != nil {
                        // Translate bolt's duplicate-bucket error into the errdefs equivalent.
                        if err == bolt.ErrBucketExists {
                                err = fmt.Errorf("snapshot %v: %w", key, errdefs.ErrAlreadyExists)
                        }
                        return err
                }

                // The snapshot identifier comes from the bucket's sequence counter.
                id, err := bkt.NextSequence()
                if err != nil {
                        return fmt.Errorf("unable to get identifier for snapshot %q: %w", key, err)
                }

                // Created and Updated start out as the same instant.
                t := time.Now().UTC()
                si := snapshots.Info{
                        Parent:  parent,
                        Kind:    kind,
                        Labels:  base.Labels,
                        Created: t,
                        Updated: t,
                }
                if err := putSnapshot(sbkt, id, si); err != nil {
                        return err
                }

                if spbkt != nil {
                        pid := readID(spbkt)

                        // Store a backlink from the key to the parent. Store the snapshot name
                        // as the value to allow following the backlink to the snapshot value.
                        if err := pbkt.Put(parentKey(pid, id), []byte(key)); err != nil {
                                return fmt.Errorf("failed to write parent link for snapshot %q: %w", key, err)
                        }

                        s.ParentIDs, err = parents(bkt, spbkt, pid)
                        if err != nil {
                                return fmt.Errorf("failed to get parent chain for snapshot %q: %w", key, err)
                        }
                }

                s.ID = strconv.FormatUint(id, 10)
                s.Kind = kind
                return nil
        })
        if err != nil {
                // Discard anything the callback stored in s before it failed.
                return Snapshot{}, err
        }

        return
}

// Remove removes a snapshot from the metastore. The string identifier for the
// snapshot is returned as well as the kind. The provided context must contain a
// writable transaction.
//
// An error wrapping errdefs.ErrNotFound is returned when no snapshot is stored
// under key or when the snapshot's recorded parent is missing, and one
// wrapping errdefs.ErrFailedPrecondition when the parents bucket still holds a
// link whose parent prefix is this snapshot's id.
func Remove(ctx context.Context, key string) (string, snapshots.Kind, error) {
        var (
                id uint64
                si snapshots.Info
        )

        if err := withBucket(ctx, func(ctx context.Context, bkt, pbkt *bolt.Bucket) error {
                sbkt := bkt.Bucket([]byte(key))
                if sbkt == nil {
                        return fmt.Errorf("snapshot %v: %w", key, errdefs.ErrNotFound)
                }

                if err := readSnapshot(sbkt, &id, &si); err != nil {
                        return fmt.Errorf("failed to read snapshot %s: %w", key, err)
                }

                if pbkt != nil {
                        // Seek to the first parent link at or after this snapshot's id
                        // prefix; a link carrying exactly this prefix means a child
                        // still references the snapshot.
                        k, _ := pbkt.Cursor().Seek(parentPrefixKey(id))
                        if getParentPrefix(k) == id {
                                return fmt.Errorf("cannot remove snapshot with child: %w", errdefs.ErrFailedPrecondition)
                        }

                        if si.Parent != "" {
                                spbkt := bkt.Bucket([]byte(si.Parent))
                                if spbkt == nil {
                                        // Name the parent that is missing, not the snapshot being
                                        // removed, so the error points at the broken reference.
                                        return fmt.Errorf("missing parent %q of snapshot %q: %w", si.Parent, key, errdefs.ErrNotFound)
                                }

                                // Drop this snapshot's own back link from its parent.
                                if err := pbkt.Delete(parentKey(readID(spbkt), id)); err != nil {
                                        return fmt.Errorf("failed to delete parent link: %w", err)
                                }
                        }
                }

                if err := bkt.DeleteBucket([]byte(key)); err != nil {
                        return fmt.Errorf("failed to delete snapshot: %w", err)
                }

                return nil
        }); err != nil {
                return "", 0, err
        }

        return strconv.FormatUint(id, 10), si.Kind, nil
}

// CommitActive renames the active snapshot transaction referenced by `key`
// as a committed snapshot referenced by `Name`. The resulting snapshot will be
// committed and readonly. The `key` reference will no longer be available for
// lookup or removal. The returned string identifier for the committed snapshot
// is the same identifier of the original active snapshot. The provided context
// must contain a writable transaction.
//
// The supplied usage is stored with the committed snapshot. Labels come only
// from opts; labels on the active snapshot are not carried over. Errors wrap
// errdefs.ErrAlreadyExists when `name` is taken, errdefs.ErrNotFound when
// `key` (or its recorded parent) is missing, and
// errdefs.ErrFailedPrecondition when `key` is not an active snapshot.
func CommitActive(ctx context.Context, key, name string, usage snapshots.Usage, opts ...snapshots.Opt) (string, error) {
        var (
                id   uint64
                base snapshots.Info
        )
        for _, opt := range opts {
                if err := opt(&base); err != nil {
                        return "", err
                }
        }

        if err := withBucket(ctx, func(ctx context.Context, bkt, pbkt *bolt.Bucket) error {
                dbkt, err := bkt.CreateBucket([]byte(name))
                if err != nil {
                        if err == bolt.ErrBucketExists {
                                err = errdefs.ErrAlreadyExists
                        }
                        return fmt.Errorf("committed snapshot %v: %w", name, err)
                }
                sbkt := bkt.Bucket([]byte(key))
                if sbkt == nil {
                        return fmt.Errorf("failed to get active snapshot %q: %w", key, errdefs.ErrNotFound)
                }

                var si snapshots.Info
                if err := readSnapshot(sbkt, &id, &si); err != nil {
                        return fmt.Errorf("failed to read active snapshot %q: %w", key, err)
                }

                if si.Kind != snapshots.KindActive {
                        return fmt.Errorf("snapshot %q is not active: %w", key, errdefs.ErrFailedPrecondition)
                }
                si.Kind = snapshots.KindCommitted
                si.Created = time.Now().UTC()
                si.Updated = si.Created

                // Replace labels, do not inherit
                si.Labels = base.Labels

                // The committed bucket reuses the active snapshot's id.
                if err := putSnapshot(dbkt, id, si); err != nil {
                        return err
                }
                if err := putUsage(dbkt, usage); err != nil {
                        return err
                }
                if err := bkt.DeleteBucket([]byte(key)); err != nil {
                        return fmt.Errorf("failed to delete active snapshot %q: %w", key, err)
                }
                if si.Parent != "" {
                        spbkt := bkt.Bucket([]byte(si.Parent))
                        if spbkt == nil {
                                return fmt.Errorf("missing parent %q of snapshot %q: %w", si.Parent, key, errdefs.ErrNotFound)
                        }
                        pid := readID(spbkt)

                        // Updates parent back link to use new key
                        if err := pbkt.Put(parentKey(pid, id), []byte(name)); err != nil {
                                // pid is numeric: %q would render it as a quoted character
                                // literal, so format it with %d.
                                return fmt.Errorf("failed to update parent link %d from %q to %q: %w", pid, key, name, err)
                        }
                }

                return nil
        }); err != nil {
                return "", err
        }

        return strconv.FormatUint(id, 10), nil
}

// IDMap returns all the IDs mapped to their key
func IDMap(ctx context.Context) (map[string]string, error) {
        m := map[string]string{}
        if err := withBucket(ctx, func(ctx context.Context, bkt, _ *bolt.Bucket) error {
                return bkt.ForEach(func(k, v []byte) error {
                        // skip non buckets
                        if v != nil {
                                return nil
                        }
                        id := readID(bkt.Bucket(k))
                        m[strconv.FormatUint(id, 10)] = string(k)
                        return nil
                })
        }); err != nil {
                return nil, err
        }

        return m, nil
}

// withSnapshotBucket resolves the bucket of the snapshot stored under key and
// invokes fn with it and the parents bucket. It returns ErrNoTransaction when
// ctx carries no bolt transaction and an errdefs.ErrNotFound-wrapped error when
// any bucket on the path to the snapshot is missing.
func withSnapshotBucket(ctx context.Context, key string, fn func(context.Context, *bolt.Bucket, *bolt.Bucket) error) error {
        tx, found := ctx.Value(transactionKey{}).(*bolt.Tx)
        if !found {
                return ErrNoTransaction
        }

        root := tx.Bucket(bucketKeyStorageVersion)
        if root == nil {
                return fmt.Errorf("bucket does not exist: %w", errdefs.ErrNotFound)
        }

        all := root.Bucket(bucketKeySnapshot)
        if all == nil {
                return fmt.Errorf("snapshots bucket does not exist: %w", errdefs.ErrNotFound)
        }

        snapshot := all.Bucket([]byte(key))
        if snapshot == nil {
                return fmt.Errorf("snapshot does not exist: %w", errdefs.ErrNotFound)
        }

        return fn(ctx, snapshot, root.Bucket(bucketKeyParents))
}

// withBucket looks up the bolt transaction stored in ctx and invokes fn with
// the snapshots bucket and the parents bucket found under the storage version
// bucket. Only a missing transaction or version bucket is reported here; the
// two sub-buckets are passed to fn exactly as looked up, without a nil check.
func withBucket(ctx context.Context, fn func(context.Context, *bolt.Bucket, *bolt.Bucket) error) error {
        tx, found := ctx.Value(transactionKey{}).(*bolt.Tx)
        if !found {
                return ErrNoTransaction
        }

        root := tx.Bucket(bucketKeyStorageVersion)
        if root == nil {
                return fmt.Errorf("bucket does not exist: %w", errdefs.ErrNotFound)
        }

        snapshotsBkt := root.Bucket(bucketKeySnapshot)
        parentsBkt := root.Bucket(bucketKeyParents)
        return fn(ctx, snapshotsBkt, parentsBkt)
}

// createBucketIfNotExists makes sure the storage version bucket and its
// snapshots and parents sub-buckets exist, creating whichever is missing, and
// then invokes fn with the snapshots and parents buckets. It returns
// ErrNoTransaction when ctx carries no bolt transaction.
func createBucketIfNotExists(ctx context.Context, fn func(context.Context, *bolt.Bucket, *bolt.Bucket) error) error {
        tx, found := ctx.Value(transactionKey{}).(*bolt.Tx)
        if !found {
                return ErrNoTransaction
        }

        root, err := tx.CreateBucketIfNotExists(bucketKeyStorageVersion)
        if err != nil {
                return fmt.Errorf("failed to create version bucket: %w", err)
        }

        snapshotsBkt, err := root.CreateBucketIfNotExists(bucketKeySnapshot)
        if err != nil {
                return fmt.Errorf("failed to create snapshots bucket: %w", err)
        }

        parentsBkt, err := root.CreateBucketIfNotExists(bucketKeyParents)
        if err != nil {
                return fmt.Errorf("failed to create parents bucket: %w", err)
        }

        return fn(ctx, snapshotsBkt, parentsBkt)
}

// parents walks the ancestry of a snapshot and returns the decimal string IDs
// it visits. The first entry is the given parent id; each following entry is
// the id of the bucket named by the previous bucket's parent key, until a
// bucket with no parent key is reached. bkt is the snapshots bucket used to
// resolve parent names, pbkt the bucket of the starting parent. A parent name
// that has no bucket yields an errdefs.ErrNotFound-wrapped error.
func parents(bkt, pbkt *bolt.Bucket, parent uint64) (parents []string, err error) {
        current, currentID := pbkt, parent
        for {
                parents = append(parents, strconv.FormatUint(currentID, 10))

                next := current.Get(bucketKeyParent)
                if len(next) == 0 {
                        // Reached a snapshot without a parent: the chain is complete.
                        return parents, nil
                }

                if current = bkt.Bucket(next); current == nil {
                        return nil, fmt.Errorf("missing parent: %w", errdefs.ErrNotFound)
                }
                currentID = readID(current)
        }
}

// readKind returns the snapshot kind stored in bkt. The zero Kind is returned
// unless the stored value is exactly one byte long.
func readKind(bkt *bolt.Bucket) (k snapshots.Kind) {
        raw := bkt.Get(bucketKeyKind)
        if len(raw) != 1 {
                return k
        }
        return snapshots.Kind(raw[0])
}

// readID decodes the uvarint-encoded snapshot id stored in bkt. The decode
// status is deliberately discarded, so a missing or malformed value yields
// whatever value binary.Uvarint reports for it (0 for an empty buffer).
func readID(bkt *bolt.Bucket) uint64 {
        raw := bkt.Get(bucketKeyID)
        id, _ := binary.Uvarint(raw)
        return id
}

// readSnapshot loads snapshot metadata from bkt. When id is non-nil it receives
// the stored identifier; when si is non-nil its Kind, Parent, Created, Updated
// and Labels fields are filled in. Either pointer may be nil to skip that part.
func readSnapshot(bkt *bolt.Bucket, id *uint64, si *snapshots.Info) error {
        if id != nil {
                *id = readID(bkt)
        }
        if si == nil {
                return nil
        }

        si.Kind = readKind(bkt)
        si.Parent = string(bkt.Get(bucketKeyParent))

        if err := boltutil.ReadTimestamps(bkt, &si.Created, &si.Updated); err != nil {
                return err
        }

        stored, err := boltutil.ReadLabels(bkt)
        if err != nil {
                return err
        }
        si.Labels = stored

        return nil
}

// putSnapshot writes the snapshot id and the Kind, Parent, timestamps and
// labels of si into bkt. The parent key is written only when si.Parent is
// non-empty.
func putSnapshot(bkt *bolt.Bucket, id uint64, si snapshots.Info) error {
        encodedID, err := encodeID(id)
        if err != nil {
                return err
        }

        if err := bkt.Put(bucketKeyID, encodedID); err != nil {
                return err
        }
        if err := bkt.Put(bucketKeyKind, []byte{byte(si.Kind)}); err != nil {
                return err
        }
        if si.Parent != "" {
                if err := bkt.Put(bucketKeyParent, []byte(si.Parent)); err != nil {
                        return err
                }
        }

        if err := boltutil.WriteTimestamps(bkt, si.Created, si.Updated); err != nil {
                return err
        }
        return boltutil.WriteLabels(bkt, si.Labels)
}

// getUsage fills usage with the varint-encoded inode count and size stored in
// bkt. Decode status is discarded, so each field takes whatever value
// binary.Varint reports for the stored bytes.
func getUsage(bkt *bolt.Bucket, usage *snapshots.Usage) {
        inodes, _ := binary.Varint(bkt.Get(bucketKeyInodes))
        size, _ := binary.Varint(bkt.Get(bucketKeySize))

        usage.Inodes = inodes
        usage.Size = size
}

// putUsage stores the inode count and then the size from usage in bkt, each
// encoded with encodeSize. The first failure is returned immediately.
func putUsage(bkt *bolt.Bucket, usage snapshots.Usage) error {
        inodes, err := encodeSize(usage.Inodes)
        if err != nil {
                return err
        }
        if err := bkt.Put(bucketKeyInodes, inodes); err != nil {
                return err
        }

        size, err := encodeSize(usage.Size)
        if err != nil {
                return err
        }
        return bkt.Put(bucketKeySize, size)
}

// encodeSize returns size as a signed varint. An error is returned if the
// encoder reports that it wrote no bytes.
func encodeSize(size int64) ([]byte, error) {
        scratch := make([]byte, binary.MaxVarintLen64)

        n := binary.PutVarint(scratch, size)
        if n == 0 {
                return nil, fmt.Errorf("failed encoding size = %v", size)
        }
        return scratch[:n], nil
}

// encodeID returns id as an unsigned varint. An error is returned if the
// encoder reports that it wrote no bytes.
func encodeID(id uint64) ([]byte, error) {
        scratch := make([]byte, binary.MaxVarintLen64)

        n := binary.PutUvarint(scratch, id)
        if n == 0 {
                return nil, fmt.Errorf("failed encoding id = %v", id)
        }
        return scratch[:n], nil
}

// adaptSnapshot exposes info to the filters package. The adaptor resolves the
// field paths "kind" (as "active", "view" or "committed"), "name", "parent"
// and "labels.<key>", where the label key is the remaining path elements
// joined with ".". Any other path, or an unrecognized kind, reports not found.
func adaptSnapshot(info snapshots.Info) filters.Adaptor {
        return filters.AdapterFunc(func(fieldpath []string) (string, bool) {
                if len(fieldpath) == 0 {
                        return "", false
                }

                field, rest := fieldpath[0], fieldpath[1:]
                switch field {
                case "name":
                        return info.Name, true
                case "parent":
                        return info.Parent, true
                case "kind":
                        switch info.Kind {
                        case snapshots.KindView:
                                return "view", true
                        case snapshots.KindActive:
                                return "active", true
                        case snapshots.KindCommitted:
                                return "committed", true
                        }
                case "labels":
                        if len(info.Labels) == 0 {
                                return "", false
                        }

                        value, found := info.Labels[strings.Join(rest, ".")]
                        return value, found
                }

                return "", false
        })
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package storage provides a metadata storage implementation for snapshot
// drivers. Driver implementations are responsible for starting and managing
// transactions using the defined context creator. This storage package uses
// BoltDB for storing metadata. Access to the raw boltdb transaction is not
// provided, but the stored object is provided by the proto subpackage.
package storage

import (
        "context"
        "errors"
        "fmt"
        "sync"

        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/log"
        bolt "go.etcd.io/bbolt"
)

// Transactor is used to finalize an active transaction. Exactly one of Commit
// or Rollback is expected to be called to end the transaction.
type Transactor interface {
        // Commit commits any changes made during the transaction. On error a
        // caller is expected to clean up any resources which would have relied
        // on data mutated as part of this transaction. Only writable
        // transactions can commit, non-writable must call Rollback.
        Commit() error

        // Rollback rolls back any changes made during the transaction. This
        // must be called on all non-writable transactions and aborted writable
        // transactions.
        Rollback() error
}

// Snapshot holds the metadata for an active or view snapshot transaction. The
// ParentIDs hold the snapshot identifiers for the snapshots this active or
// view is based on. As populated by this package, ParentIDs starts with the
// snapshot's immediate parent at index 0 and continues with each successive
// ancestor, so the last index is the base snapshot that has no parent.
type Snapshot struct {
        // Kind is the kind of the snapshot.
        Kind      snapshots.Kind
        // ID is the decimal string form of the snapshot's numeric identifier.
        ID        string
        // ParentIDs lists ancestor identifiers, immediate parent first.
        ParentIDs []string
}

// MetaStore is used to store metadata related to a snapshot driver. The
// MetaStore is intended to store metadata related to name, state and
// parentage. Using the MetaStore is not required to implement a snapshot
// driver but can be used to handle the persistence and transactional
// complexities of a driver implementation.
type MetaStore struct {
        // dbfile is the path passed to bolt.Open when the database is opened.
        dbfile string

        // dbL guards the lazy open of db in TransactionContext and its use in
        // Close.
        dbL sync.Mutex
        // db stays nil until the first TransactionContext call opens it.
        db  *bolt.DB
}

// NewMetaStore returns a snapshot MetaStore for storage of metadata related to
// a snapshot driver backed by a bolt file database. This implementation is
// strongly consistent and does all metadata changes in a transaction to guard
// against process crashes causing inconsistent metadata state.
//
// Only the path is recorded here; the database file is not opened by this
// call and the returned error is always nil.
func NewMetaStore(dbfile string) (*MetaStore, error) {
        store := new(MetaStore)
        store.dbfile = dbfile
        return store, nil
}

// transactionKey is the context key under which the active *bolt.Tx is stored.
type transactionKey struct{}

// TransactionContext creates a new transaction context. The writable value
// should be set to true for transactions which are expected to mutate data.
//
// The database file is opened on first use. On success the returned context
// carries the transaction and the returned Transactor must be used to commit
// or roll it back. On failure the original context is returned unchanged
// together with a nil Transactor.
func (ms *MetaStore) TransactionContext(ctx context.Context, writable bool) (context.Context, Transactor, error) {
        // Open the database lazily; the lock covers only the open, not the
        // transaction start below.
        openErr := func() error {
                ms.dbL.Lock()
                defer ms.dbL.Unlock()

                if ms.db != nil {
                        return nil
                }
                opened, err := bolt.Open(ms.dbfile, 0600, nil)
                if err != nil {
                        return err
                }
                ms.db = opened
                return nil
        }()
        if openErr != nil {
                return ctx, nil, fmt.Errorf("failed to open database file: %w", openErr)
        }

        tx, err := ms.db.Begin(writable)
        if err != nil {
                return ctx, nil, fmt.Errorf("failed to start transaction: %w", err)
        }

        return context.WithValue(ctx, transactionKey{}, tx), tx, nil
}

// TransactionCallback represents a callback to be invoked while under a metastore transaction.
// The context passed to the callback is the one carrying the transaction.
type TransactionCallback func(ctx context.Context) error

// WithTransaction is a convenience method to run a function `fn` while holding a meta store transaction.
// If the callback `fn` returns an error or the transaction is not writable, the database transaction will be discarded;
// otherwise it is committed. The returned error joins the callback error with any rollback or commit failure.
func (ms *MetaStore) WithTransaction(ctx context.Context, writable bool, fn TransactionCallback) error {
        ctx, trans, err := ms.TransactionContext(ctx, writable)
        if err != nil {
                return err
        }

        var failures []error
        fnErr := fn(ctx)
        if fnErr != nil {
                failures = append(failures, fnErr)
        }

        switch {
        case fnErr == nil && writable:
                // Only a successful writable transaction is committed.
                if cerr := trans.Commit(); cerr != nil {
                        log.G(ctx).WithError(cerr).Error("failed to commit transaction")

                        failures = append(failures, fmt.Errorf("commit failed: %w", cerr))
                }
        default:
                // Read-only and failed transactions are always rolled back.
                if rerr := trans.Rollback(); rerr != nil {
                        log.G(ctx).WithError(rerr).Error("failed to rollback transaction")

                        failures = append(failures, fmt.Errorf("rollback failed: %w", rerr))
                }
        }

        joined := errors.Join(failures...)
        if joined == nil {
                return nil
        }

        log.G(ctx).WithError(joined).Debug("snapshotter error")
        return joined
}

// Close closes the metastore and any underlying database connections. It is a
// no-op returning nil when the database was never opened.
func (ms *MetaStore) Close() error {
        ms.dbL.Lock()
        defer ms.dbL.Unlock()

        if db := ms.db; db != nil {
                return db.Close()
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proxy

import (
        "context"
        "errors"
        "fmt"
        "io"

        streamingapi "github.com/containerd/containerd/api/services/streaming/v1"
        "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
        "github.com/containerd/ttrpc"
        "github.com/containerd/typeurl/v2"
        "google.golang.org/grpc"
)

// NewStreamCreator returns a new stream creator which can communicate over a GRPC
// or TTRPC connection using the containerd streaming API.
//
// client may be a GRPC streaming client or connection, a TTRPC streaming
// client or *ttrpc.Client, or an existing streaming.StreamCreator, which is
// returned as is. Any other type causes a panic with an error wrapping
// errdefs.ErrNotImplemented.
func NewStreamCreator(client any) streaming.StreamCreator {
        var api streamingapi.TTRPCStreamingClient

        // The case order is significant when a value satisfies several of
        // these interfaces.
        switch c := client.(type) {
        case streamingapi.StreamingClient:
                api = convertClient{c}
        case grpc.ClientConnInterface:
                api = convertClient{streamingapi.NewStreamingClient(c)}
        case streamingapi.TTRPCStreamingClient:
                api = c
        case *ttrpc.Client:
                api = streamingapi.NewTTRPCStreamingClient(c)
        case streaming.StreamCreator:
                return c
        default:
                panic(fmt.Errorf("unsupported stream client %T: %w", client, errdefs.ErrNotImplemented))
        }

        return &streamCreator{client: api}
}

// convertClient wraps a GRPC streaming client so that it can be used where a
// TTRPC streaming client is expected.
type convertClient struct {
        streamingapi.StreamingClient
}

// Stream opens a stream on the wrapped GRPC client, passing no call options,
// and returns it as a TTRPC stream client.
func (c convertClient) Stream(ctx context.Context) (streamingapi.TTRPCStreaming_StreamClient, error) {
        grpcClient := c.StreamingClient
        return grpcClient.Stream(ctx)
}

// streamCreator creates streams through a TTRPC-shaped streaming client.
type streamCreator struct {
        client streamingapi.TTRPCStreamingClient
}

// Create opens a stream, sends a StreamInit message carrying id and waits for
// the first message back before returning the stream. Send and receive errors
// other than io.EOF are translated with errdefs.FromGRPC.
func (sc *streamCreator) Create(ctx context.Context, id string) (streaming.Stream, error) {
        // io.EOF is passed through untouched; everything else is translated.
        translate := func(err error) error {
                if errors.Is(err, io.EOF) {
                        return err
                }
                return errdefs.FromGRPC(err)
        }

        s, err := sc.client.Stream(ctx)
        if err != nil {
                return nil, err
        }

        initMsg, err := typeurl.MarshalAny(&streamingapi.StreamInit{ID: id})
        if err != nil {
                return nil, err
        }

        if err := s.Send(protobuf.FromAny(initMsg)); err != nil {
                return nil, translate(err)
        }

        // Receive an ack that stream is init and ready
        if _, err := s.Recv(); err != nil {
                return nil, translate(err)
        }

        return &clientStream{s: s}, nil
}

// clientStream adapts a streaming API stream client to streaming.Stream.
type clientStream struct {
        s streamingapi.TTRPCStreaming_StreamClient
}

// Send writes a to the stream. io.EOF is returned as is; any other result is
// passed through errdefs.FromGRPC.
func (cs *clientStream) Send(a typeurl.Any) (err error) {
        if err = cs.s.Send(protobuf.FromAny(a)); errors.Is(err, io.EOF) {
                return err
        }
        return errdefs.FromGRPC(err)
}

// Recv reads the next message from the stream. io.EOF is returned as is; any
// other result is passed through errdefs.FromGRPC.
func (cs *clientStream) Recv() (a typeurl.Any, err error) {
        a, err = cs.s.Recv()
        if errors.Is(err, io.EOF) {
                return a, err
        }
        return a, errdefs.FromGRPC(err)
}

// Close closes the sending side of the stream.
func (cs *clientStream) Close() error {
        stream := cs.s
        return stream.CloseSend()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import (
        "context"
        "io"

        "github.com/containerd/typeurl/v2"
        v1 "github.com/opencontainers/image-spec/specs-go/v1"

        "github.com/containerd/containerd/api/types"
        transfertypes "github.com/containerd/containerd/api/types/transfer"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/images/archive"
        "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/containerd/v2/core/transfer/plugins"
        tstreaming "github.com/containerd/containerd/v2/core/transfer/streaming"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
)

// init registers the export and import stream implementations with the
// transfer plugins registry, pairing each transfer API message type with its
// Go counterpart in this package.
func init() {
        // TODO: Move this to separate package?
        plugins.Register(&transfertypes.ImageExportStream{}, &ImageExportStream{})
        plugins.Register(&transfertypes.ImageImportStream{}, &ImageImportStream{})
}

// ExportOpt configures an ImageExportStream.
type ExportOpt func(*ImageExportStream)

// WithPlatform returns an ExportOpt that adds p to the list of platforms to
// export. It may be supplied several times to add several platforms.
func WithPlatform(p v1.Platform) ExportOpt {
        return func(s *ImageExportStream) {
                s.platforms = append(s.platforms, p)
        }
}

// WithAllPlatforms is an ExportOpt that sets the all-platforms flag. Pass the
// function itself as the option, without calling it.
func WithAllPlatforms(s *ImageExportStream) {
        s.allPlatforms = true
}

// WithSkipCompatibilityManifest is an ExportOpt that sets the
// skip-compatibility-manifest flag. Pass the function itself as the option,
// without calling it.
func WithSkipCompatibilityManifest(s *ImageExportStream) {
        s.skipCompatibilityManifest = true
}

// WithSkipNonDistributableBlobs is an ExportOpt that sets the
// skip-non-distributable flag. Pass the function itself as the option, without
// calling it.
func WithSkipNonDistributableBlobs(s *ImageExportStream) {
        s.skipNonDistributable = true
}

// NewImageExportStream returns an image exporter via tar stream. The archive
// is written to stream, mediaType is recorded as the stream's media type, and
// opts are applied in the order given.
func NewImageExportStream(stream io.WriteCloser, mediaType string, opts ...ExportOpt) *ImageExportStream {
        exporter := new(ImageExportStream)
        exporter.stream = stream
        exporter.mediaType = mediaType

        for i := range opts {
                opts[i](exporter)
        }
        return exporter
}

// ImageExportStream exports images as an archive written to a stream.
type ImageExportStream struct {
        // stream receives the exported archive.
        stream    io.WriteCloser
        // mediaType is the media type reported for the stream.
        mediaType string

        // platforms lists the platforms to export; when empty, Export falls
        // back to platforms.DefaultStrict().
        platforms                 []v1.Platform
        // allPlatforms adds archive.WithAllPlatforms() to the export options.
        allPlatforms              bool
        // skipCompatibilityManifest adds archive.WithSkipDockerManifest().
        skipCompatibilityManifest bool
        // skipNonDistributable adds archive.WithSkipNonDistributableBlobs().
        skipNonDistributable      bool
}

// ExportStream returns the destination stream and its media type. The context
// is unused and the returned error is always nil.
func (iis *ImageExportStream) ExportStream(context.Context) (io.WriteCloser, string, error) {
        return iis.stream, iis.mediaType, nil
}

// Export writes imgs from the content store cs to the export stream as an
// archive. The configured platforms select what is exported, falling back to
// platforms.DefaultStrict() when none were given; the remaining flags are
// translated into the matching archive export options.
func (iis *ImageExportStream) Export(ctx context.Context, cs content.Store, imgs []images.Image) error {
        exportOpts := []archive.ExportOpt{archive.WithImages(imgs)}

        switch {
        case len(iis.platforms) > 0:
                exportOpts = append(exportOpts, archive.WithPlatform(platforms.Ordered(iis.platforms...)))
        default:
                exportOpts = append(exportOpts, archive.WithPlatform(platforms.DefaultStrict()))
        }

        if iis.allPlatforms {
                exportOpts = append(exportOpts, archive.WithAllPlatforms())
        }
        if iis.skipCompatibilityManifest {
                exportOpts = append(exportOpts, archive.WithSkipDockerManifest())
        }
        if iis.skipNonDistributable {
                exportOpts = append(exportOpts, archive.WithSkipNonDistributableBlobs())
        }

        return archive.Export(ctx, cs, iis.stream, exportOpts...)
}

// MarshalAny creates a new stream through sm, starts a goroutine that copies
// everything received on that stream into the local writer (closing the
// writer when the copy ends), and returns the export settings, including the
// stream id, marshaled as a transfer API ImageExportStream.
func (iis *ImageExportStream) MarshalAny(ctx context.Context, sm streaming.StreamCreator) (typeurl.Any, error) {
        sid := tstreaming.GenerateID("export")
        remote, err := sm.Create(ctx, sid)
        if err != nil {
                return nil, err
        }

        // Receive stream and copy to writer
        go func() {
                _, copyErr := io.Copy(iis.stream, tstreaming.ReceiveStream(ctx, remote))
                if copyErr != nil {
                        log.G(ctx).WithError(copyErr).WithField("streamid", sid).Errorf("error copying stream")
                }
                iis.stream.Close()
        }()

        // Only OS, architecture and variant are carried over to the message.
        var specified []*types.Platform
        for i := range iis.platforms {
                p := iis.platforms[i]
                specified = append(specified, &types.Platform{
                        OS:           p.OS,
                        Architecture: p.Architecture,
                        Variant:      p.Variant,
                })
        }

        return typeurl.MarshalAny(&transfertypes.ImageExportStream{
                Stream:                    sid,
                MediaType:                 iis.mediaType,
                Platforms:                 specified,
                AllPlatforms:              iis.allPlatforms,
                SkipCompatibilityManifest: iis.skipCompatibilityManifest,
                SkipNonDistributable:      iis.skipNonDistributable,
        })
}

// UnmarshalAny decodes anyType as a transfer API ImageExportStream, looks up
// the stream it names through sm, and populates iis from the message. The
// export writer is set to a byte-stream writer over the looked-up stream.
func (iis *ImageExportStream) UnmarshalAny(ctx context.Context, sm streaming.StreamGetter, anyType typeurl.Any) error {
        var msg transfertypes.ImageExportStream
        if err := typeurl.UnmarshalTo(anyType, &msg); err != nil {
                return err
        }

        remote, err := sm.Get(ctx, msg.Stream)
        if err != nil {
                log.G(ctx).WithError(err).WithField("stream", msg.Stream).Debug("failed to get export stream")
                return err
        }

        iis.platforms = types.OCIPlatformFromProto(msg.Platforms)
        iis.stream = tstreaming.WriteByteStream(ctx, remote)
        iis.mediaType = msg.MediaType
        iis.allPlatforms = msg.AllPlatforms
        iis.skipCompatibilityManifest = msg.SkipCompatibilityManifest
        iis.skipNonDistributable = msg.SkipNonDistributable

        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import (
        "context"
        "io"

        "github.com/containerd/typeurl/v2"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"

        transferapi "github.com/containerd/containerd/api/types/transfer"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images/archive"
        "github.com/containerd/containerd/v2/core/streaming"
        tstreaming "github.com/containerd/containerd/v2/core/transfer/streaming"
        "github.com/containerd/containerd/v2/pkg/archive/compression"
        "github.com/containerd/log"
)

// ImportOpt configures an ImageImportStream.
type ImportOpt func(*ImageImportStream)

// WithForceCompression is an ImportOpt that sets the force-compress flag,
// which makes Import pass archive.WithImportCompression(). Pass the function
// itself as the option, without calling it.
func WithForceCompression(s *ImageImportStream) {
        s.forceCompress = true
}

// NewImageImportStream returns an image importer via tar stream. The archive
// is read from stream, mediaType is recorded as the stream's media type, and
// opts are applied in the order given.
func NewImageImportStream(stream io.Reader, mediaType string, opts ...ImportOpt) *ImageImportStream {
        importer := new(ImageImportStream)
        importer.stream = stream
        importer.mediaType = mediaType

        for i := range opts {
                opts[i](importer)
        }
        return importer
}

// ImageImportStream imports images from an archive read from a stream.
type ImageImportStream struct {
        // stream is the source of the archive.
        stream        io.Reader
        // mediaType is the media type reported for the stream; when empty,
        // Import passes the stream through compression.DecompressStream.
        mediaType     string
        // forceCompress makes Import pass archive.WithImportCompression().
        forceCompress bool
}

// ImportStream returns the source stream and its media type. The context is
// unused and the returned error is always nil.
func (iis *ImageImportStream) ImportStream(context.Context) (io.Reader, string, error) {
        return iis.stream, iis.mediaType, nil
}

// Import reads the archive from the stream into store and returns the
// descriptor produced by archive.ImportIndex. When no media type was given the
// stream is first wrapped with compression.DecompressStream, and that wrapper
// is closed when Import returns.
func (iis *ImageImportStream) Import(ctx context.Context, store content.Store) (ocispec.Descriptor, error) {
        src := iis.stream
        if iis.mediaType == "" {
                decompressed, err := compression.DecompressStream(src)
                if err != nil {
                        return ocispec.Descriptor{}, err
                }
                defer decompressed.Close()
                src = decompressed
        }

        var importOpts []archive.ImportOpt
        if iis.forceCompress {
                importOpts = append(importOpts, archive.WithImportCompression())
        }

        return archive.ImportIndex(ctx, store, src, importOpts...)
}

// MarshalAny creates a new stream through sm, hands the local reader and that
// stream to tstreaming.SendStream, and returns the import settings, including
// the stream id, marshaled as a transfer API ImageImportStream.
func (iis *ImageImportStream) MarshalAny(ctx context.Context, sm streaming.StreamCreator) (typeurl.Any, error) {
        sid := tstreaming.GenerateID("import")

        remote, err := sm.Create(ctx, sid)
        if err != nil {
                return nil, err
        }
        tstreaming.SendStream(ctx, iis.stream, remote)

        return typeurl.MarshalAny(&transferapi.ImageImportStream{
                Stream:        sid,
                MediaType:     iis.mediaType,
                ForceCompress: iis.forceCompress,
        })
}

// UnmarshalAny decodes anyType as a transferapi.ImageImportStream, looks up
// the referenced stream through sm, and populates the importer with a reader
// over that stream along with the media type and force-compress flag from
// the message. A failure to get the stream is logged at debug level and
// returned.
func (iis *ImageImportStream) UnmarshalAny(ctx context.Context, sm streaming.StreamGetter, anyType typeurl.Any) error {
        var msg transferapi.ImageImportStream
        err := typeurl.UnmarshalTo(anyType, &msg)
        if err != nil {
                return err
        }

        src, err := sm.Get(ctx, msg.Stream)
        if err != nil {
                log.G(ctx).WithError(err).WithField("stream", msg.Stream).Debug("failed to get import stream")
                return err
        }

        iis.stream = tstreaming.ReceiveStream(ctx, src)
        iis.mediaType = msg.MediaType
        iis.forceCompress = msg.ForceCompress
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package image

import (
        "context"
        "fmt"

        "github.com/containerd/typeurl/v2"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"

        "github.com/containerd/containerd/api/types"
        transfertypes "github.com/containerd/containerd/api/types/transfer"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/images/archive"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/containerd/v2/core/transfer"
        "github.com/containerd/containerd/v2/core/transfer/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
)

// init registers Store with the transfer plugins package for the
// transfertypes.ImageStore message type.
func init() {
        // TODO: Move this to separate package?
        plugins.Register(&transfertypes.ImageStore{}, &Store{}) // TODO: Rename ImageStoreDestination
}

// Store describes the local image store as a transfer source or destination.
// It is configured through StoreOpt functions passed to NewStore, or
// populated from a transfertypes.ImageStore message by UnmarshalAny.
type Store struct {
        // imageName is the primary image name used by Store, Get and Lookup.
        imageName     string
        // imageLabels are set as the Labels of every image created by Store.
        imageLabels   map[string]string
        // platforms selects the platform matcher built by ImageFilter; when
        // empty, platforms.All is used.
        platforms     []ocispec.Platform
        // allMetadata makes ImageFilter use
        // remotes.FilterManifestByPlatformHandler instead of
        // images.FilterPlatforms.
        allMetadata   bool
        // labelMap is passed to images.SetChildrenMappedLabels by ImageFilter.
        // It is not part of the marshaled form.
        labelMap      func(ocispec.Descriptor) []string
        // manifestLimit, when greater than zero, is passed to
        // images.LimitManifests by ImageFilter.
        manifestLimit int

        // extraReferences are used to store or lookup multiple references
        extraReferences []Reference

        // unpacks lists the unpack configurations returned by UnpackPlatforms.
        unpacks []transfer.UnpackConfiguration
}

// Reference is used to create or find a reference for an image.
// A Reference is either a complete name (IsPrefix false) or a name prefix
// (IsPrefix true); the remaining flags only apply to prefixes.
type Reference struct {
        // Name is the full image name, or the name prefix when IsPrefix is set.
        Name string

        // IsPrefix determines whether the Name should be considered
        // a prefix (without tag or digest).
        // For lookup, this may allow matching multiple tags.
        // For store, this must have a tag or digest added.
        IsPrefix bool

        // AllowOverwrite allows overwriting or ignoring the name if
        // another reference is provided (such as through an annotation).
        // Only used if IsPrefix is true.
        AllowOverwrite bool

        // AddDigest adds the manifest digest to the reference.
        // For lookup, this allows matching tags with any digest.
        // For store, this allows adding the digest to the name.
        // Only used if IsPrefix is true.
        AddDigest bool

        // SkipNamedDigest only considers digest references which do not
        // have a non-digested named reference.
        // For lookup, this will deduplicate digest references when there is a named match.
        // For store, this only adds this digest reference when there is no matching full
        // name reference from the prefix.
        // Only used if IsPrefix is true.
        SkipNamedDigest bool
}

// StoreOpt defines options when configuring an image store source or
// destination. Options are applied in order by NewStore.
type StoreOpt func(*Store)

// WithImageLabels sets the labels applied to images created by the store.
// The map is stored as given, not copied.
func WithImageLabels(labels map[string]string) StoreOpt {
        apply := func(st *Store) {
                st.imageLabels = labels
        }
        return apply
}

// WithPlatforms specifies which platforms to fetch content for. Platforms
// are appended to any already configured on the store.
func WithPlatforms(p ...ocispec.Platform) StoreOpt {
        apply := func(st *Store) {
                st.platforms = append(st.platforms, p...)
        }
        return apply
}

// WithManifestLimit defines the max number of manifests to fetch. The limit
// only takes effect in ImageFilter when it is greater than zero.
func WithManifestLimit(limit int) StoreOpt {
        apply := func(st *Store) {
                st.manifestLimit = limit
        }
        return apply
}

// WithAllMetadata is a StoreOpt which sets the allMetadata flag. With the
// flag set, ImageFilter filters manifests with
// remotes.FilterManifestByPlatformHandler rather than images.FilterPlatforms.
func WithAllMetadata(s *Store) {
        s.allMetadata = true
}

// WithNamedPrefix adds a prefix reference used to name images which only
// carry a tag in their annotations, or to check fully named annotation
// references against. Images for which no reference can be resolved from
// matching annotations will not be stored.
//   - name: image name prefix to append a tag to or check full name
//     references with
//   - allowOverwrite: allows the tag to be overwritten by a full name
//     reference inside the image which does not have name as the prefix
func WithNamedPrefix(name string, allowOverwrite bool) StoreOpt {
        return func(st *Store) {
                st.extraReferences = append(st.extraReferences, Reference{
                        Name:           name,
                        IsPrefix:       true,
                        AllowOverwrite: allowOverwrite,
                })
        }
}

// WithDigestRef adds a prefix reference like WithNamedPrefix does, and
// additionally allows a digest reference to be added. Images with no
// references resolved from matching annotations may be stored by digest.
//   - name: image name prefix to append a tag to or check full name
//     references with
//   - allowOverwrite: allows the tag to be overwritten by a full name
//     reference inside the image which does not have name as the prefix
//   - skipNamed: set if no digest reference should be created when a named
//     reference is successfully resolved from the annotations
func WithDigestRef(name string, allowOverwrite bool, skipNamed bool) StoreOpt {
        return func(st *Store) {
                st.extraReferences = append(st.extraReferences, Reference{
                        Name:            name,
                        IsPrefix:        true,
                        AllowOverwrite:  allowOverwrite,
                        AddDigest:       true,
                        SkipNamedDigest: skipNamed,
                })
        }
}

// WithExtraReference adds a complete (non-prefix) reference with the given
// name to the store's extra references.
func WithExtraReference(name string) StoreOpt {
        return func(st *Store) {
                st.extraReferences = append(st.extraReferences, Reference{Name: name})
        }
}

// WithUnpack specifies a platform to unpack for and an optional snapshotter
// to use. Each use appends one more unpack configuration to the store.
func WithUnpack(p ocispec.Platform, snapshotter string) StoreOpt {
        cfg := transfer.UnpackConfiguration{
                Platform:    p,
                Snapshotter: snapshotter,
        }
        return func(st *Store) {
                st.unpacks = append(st.unpacks, cfg)
        }
}

// NewStore creates a new image store source or destination for the given
// image name, applying each option in the order provided.
func NewStore(image string, opts ...StoreOpt) *Store {
        st := new(Store)
        st.imageName = image
        for i := range opts {
                opts[i](st)
        }
        return st
}

// String returns a human-readable description of the store which includes
// the configured image name.
func (is *Store) String() string {
        return fmt.Sprintf("Local Image Store (%s)", is.imageName)
}

// ImageFilter wraps h with the handlers implied by the store configuration
// and returns the resulting handler:
//   - images.SetChildrenMappedLabels using the store's label map,
//   - a platform filter (manifest-only when allMetadata is set), matching
//     all platforms when none are configured,
//   - images.LimitManifests when a positive manifest limit is configured.
func (is *Store) ImageFilter(h images.HandlerFunc, cs content.Store) images.HandlerFunc {
        var matcher platforms.MatchComparer = platforms.All
        if len(is.platforms) > 0 {
                matcher = platforms.Ordered(is.platforms...)
        }

        h = images.SetChildrenMappedLabels(cs, h, is.labelMap)

        switch {
        case is.allMetadata:
                // Filter manifests by platform while still allowing manifest
                // and configuration handling for non-target platforms.
                h = remotes.FilterManifestByPlatformHandler(h, matcher)
        default:
                // Filter children by platform if specified.
                h = images.FilterPlatforms(h, matcher)
        }

        // Sort and limit manifests only when a finite number is requested.
        if is.manifestLimit > 0 {
                h = images.LimitManifests(h, matcher, is.manifestLimit)
        }
        return h
}

// Store creates or updates image records for desc in store and returns the
// resulting images.
//
// The names used depend on the "io.containerd.import.ref-source" annotation
// on desc:
//   - When present with value "annotation", names are derived from the
//     descriptor annotations for each prefix reference in extraReferences
//     (optionally adding a digest reference). Any other value results in an
//     error wrapping errdefs.ErrInvalidArgument. The annotation is removed
//     from desc.Annotations afterwards.
//   - When absent, the configured image name (if non-empty) and every
//     non-prefix extra reference are used.
//
// If no names are produced, an error wrapping errdefs.ErrNotFound is
// returned. Each image is created; if it already exists it is updated
// instead, and if the update reports not found the create is retried.
func (is *Store) Store(ctx context.Context, desc ocispec.Descriptor, store images.Store) ([]images.Image, error) {
        var imgs []images.Image

        // If import ref type, store references from annotation or prefix
        if refSource, ok := desc.Annotations["io.containerd.import.ref-source"]; ok {
                switch refSource {
                case "annotation":
                        for _, ref := range is.extraReferences {
                                // Only use prefix references for annotation matching
                                if !ref.IsPrefix {
                                        continue
                                }

                                // Choose how annotation names are transformed
                                // relative to the prefix.
                                var nameT func(string) string
                                if ref.AllowOverwrite {
                                        nameT = archive.AddRefPrefix(ref.Name)
                                } else {
                                        nameT = archive.FilterRefPrefix(ref.Name)
                                }
                                name := imageName(desc.Annotations, nameT)

                                if name == "" {
                                        // If digested, add digest reference
                                        if ref.AddDigest {
                                                imgs = append(imgs, images.Image{
                                                        Name:   fmt.Sprintf("%s@%s", ref.Name, desc.Digest),
                                                        Target: desc,
                                                        Labels: is.imageLabels,
                                                })
                                        }
                                        continue
                                }

                                imgs = append(imgs, images.Image{
                                        Name:   name,
                                        Target: desc,
                                        Labels: is.imageLabels,
                                })

                                // If a named reference was found and SkipNamedDigest is true, do
                                // not use this reference
                                if ref.AddDigest && !ref.SkipNamedDigest {
                                        imgs = append(imgs, images.Image{
                                                Name:   fmt.Sprintf("%s@%s", ref.Name, desc.Digest),
                                                Target: desc,
                                                Labels: is.imageLabels,
                                        })
                                }
                        }
                default:
                        return nil, fmt.Errorf("ref source not supported: %w", errdefs.ErrInvalidArgument)
                }
                // The annotations map is shared with the Target of the images
                // collected above, so this removes the key from them as well.
                delete(desc.Annotations, "io.containerd.import.ref-source")
        } else {
                if is.imageName != "" {
                        imgs = append(imgs, images.Image{
                                Name:   is.imageName,
                                Target: desc,
                                Labels: is.imageLabels,
                        })
                }

                // If extra references, store all complete references (skip prefixes)
                for _, ref := range is.extraReferences {
                        if ref.IsPrefix {
                                continue
                        }
                        name := ref.Name
                        if ref.AddDigest {
                                name = fmt.Sprintf("%s@%s", name, desc.Digest)
                        }
                        imgs = append(imgs, images.Image{
                                Name:   name,
                                Target: desc,
                                Labels: is.imageLabels,
                        })
                }
        }

        if len(imgs) == 0 {
                return nil, fmt.Errorf("no image name found: %w", errdefs.ErrNotFound)
        }

        // The index is only advanced once the image has been created or
        // updated; a not-found on update retries the same entry.
        for i := 0; i < len(imgs); {
                if created, err := store.Create(ctx, imgs[i]); err != nil {
                        if !errdefs.IsAlreadyExists(err) {
                                return nil, err
                        }

                        updated, err := store.Update(ctx, imgs[i])
                        if err != nil {
                                // if image was removed, try create again
                                if errdefs.IsNotFound(err) {
                                        // Keep trying same image
                                        continue
                                }
                                return nil, err
                        }

                        imgs[i] = updated
                } else {
                        imgs[i] = created
                }

                i++
        }

        return imgs, nil
}

// Get returns the image stored under the configured image name in store.
func (is *Store) Get(ctx context.Context, store images.Store) (images.Image, error) {
        return store.Get(ctx, is.imageName)
}

// Lookup returns the images for the configured image name (when non-empty)
// followed by every extra reference, in order. Prefix references are not
// supported and yield an error wrapping errdefs.ErrNotImplemented. The first
// failing store lookup aborts the call and its error is returned.
func (is *Store) Lookup(ctx context.Context, store images.Store) ([]images.Image, error) {
        var found []images.Image

        // fetch resolves a single name and records the image on success.
        fetch := func(name string) error {
                img, err := store.Get(ctx, name)
                if err != nil {
                        return err
                }
                found = append(found, img)
                return nil
        }

        if is.imageName != "" {
                if err := fetch(is.imageName); err != nil {
                        return nil, err
                }
        }

        for i := range is.extraReferences {
                ref := &is.extraReferences[i]
                if ref.IsPrefix {
                        return nil, fmt.Errorf("prefix lookup on export not implemented: %w", errdefs.ErrNotImplemented)
                }
                if err := fetch(ref.Name); err != nil {
                        return nil, err
                }
        }
        return found, nil
}

// UnpackPlatforms returns a new slice holding the snapshotter and platform
// of each configured unpack configuration.
func (is *Store) UnpackPlatforms() []transfer.UnpackConfiguration {
        out := make([]transfer.UnpackConfiguration, 0, len(is.unpacks))
        for i := range is.unpacks {
                src := &is.unpacks[i]
                out = append(out, transfer.UnpackConfiguration{
                        Platform:    src.Platform,
                        Snapshotter: src.Snapshotter,
                })
        }
        return out
}

// MarshalAny converts the store configuration into a
// transfertypes.ImageStore message and marshals it. The label map function
// is not part of the message. Neither argument is used.
func (is *Store) MarshalAny(context.Context, streaming.StreamCreator) (typeurl.Any, error) {
        return typeurl.MarshalAny(&transfertypes.ImageStore{
                Name:            is.imageName,
                Labels:          is.imageLabels,
                ManifestLimit:   uint32(is.manifestLimit),
                AllMetadata:     is.allMetadata,
                Platforms:       types.OCIPlatformToProto(is.platforms),
                ExtraReferences: referencesToProto(is.extraReferences),
                Unpacks:         unpackToProto(is.unpacks),
        })
}

// UnmarshalAny decodes a as a transfertypes.ImageStore message and replaces
// the store configuration with its contents. On a decode error the store is
// left unmodified. The context and stream getter are unused.
func (is *Store) UnmarshalAny(ctx context.Context, sm streaming.StreamGetter, a typeurl.Any) error {
        var msg transfertypes.ImageStore
        err := typeurl.UnmarshalTo(a, &msg)
        if err != nil {
                return err
        }

        is.imageName = msg.Name
        is.imageLabels = msg.Labels
        is.manifestLimit = int(msg.ManifestLimit)
        is.allMetadata = msg.AllMetadata
        is.platforms = types.OCIPlatformFromProto(msg.Platforms)
        is.extraReferences = referencesFromProto(msg.ExtraReferences)
        is.unpacks = unpackFromProto(msg.Unpacks)
        return nil
}

// referencesToProto converts references into their protobuf form, one
// message per reference and in the same order. The result is never nil.
func referencesToProto(references []Reference) []*transfertypes.ImageReference {
        out := make([]*transfertypes.ImageReference, 0, len(references))
        for i := range references {
                ref := &references[i]
                out = append(out, &transfertypes.ImageReference{
                        Name:            ref.Name,
                        IsPrefix:        ref.IsPrefix,
                        AllowOverwrite:  ref.AllowOverwrite,
                        AddDigest:       ref.AddDigest,
                        SkipNamedDigest: ref.SkipNamedDigest,
                })
        }
        return out
}

// referencesFromProto converts protobuf image references into Reference
// values, preserving order. Entries are dereferenced directly, so they must
// not be nil. The result is never nil.
func referencesFromProto(references []*transfertypes.ImageReference) []Reference {
        out := make([]Reference, 0, len(references))
        for _, pr := range references {
                out = append(out, Reference{
                        Name:            pr.Name,
                        IsPrefix:        pr.IsPrefix,
                        AllowOverwrite:  pr.AllowOverwrite,
                        AddDigest:       pr.AddDigest,
                        SkipNamedDigest: pr.SkipNamedDigest,
                })
        }
        return out
}
// unpackToProto converts unpack configurations into their protobuf form.
// Only the OS, architecture and variant of the platform are carried over,
// together with the snapshotter name. The result is never nil.
func unpackToProto(uc []transfer.UnpackConfiguration) []*transfertypes.UnpackConfiguration {
        out := make([]*transfertypes.UnpackConfiguration, 0, len(uc))
        for i := range uc {
                src := &uc[i]
                out = append(out, &transfertypes.UnpackConfiguration{
                        Platform: &types.Platform{
                                OS:           src.Platform.OS,
                                Architecture: src.Platform.Architecture,
                                Variant:      src.Platform.Variant,
                        },
                        Snapshotter: src.Snapshotter,
                })
        }
        return out
}

// unpackFromProto converts protobuf unpack configurations into
// transfer.UnpackConfiguration values. The platform OS, architecture and
// variant are copied only when the message carries a platform; otherwise the
// platform is left at its zero value. Entries themselves must not be nil.
func unpackFromProto(auc []*transfertypes.UnpackConfiguration) []transfer.UnpackConfiguration {
        out := make([]transfer.UnpackConfiguration, len(auc))
        for i, in := range auc {
                dst := &out[i]
                dst.Snapshotter = in.Snapshotter
                if p := in.Platform; p != nil {
                        dst.Platform.OS = p.OS
                        dst.Platform.Architecture = p.Architecture
                        dst.Platform.Variant = p.Variant
                }
        }
        return out
}

// imageName resolves an image name from descriptor annotations.
//
// The images.AnnotationImageName annotation takes precedence. It is expected
// to be a full reference already: when cleanup is provided and would modify
// it, the name is rejected and "" is returned. Otherwise the
// ocispec.AnnotationRefName annotation is used, passed through cleanup when
// one is provided. An empty string is returned when neither annotation is set.
func imageName(annotations map[string]string, cleanup func(string) string) string {
        if full := annotations[images.AnnotationImageName]; full != "" {
                // containerd reference name should be a full reference and not
                // modified; if it is incomplete or does not match a specified
                // prefix, do not use the reference.
                if cleanup != nil && cleanup(full) != full {
                        return ""
                }
                return full
        }

        ref := annotations[ocispec.AnnotationRefName]
        if ref == "" || cleanup == nil {
                return ref
        }
        return cleanup(ref)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "context"

        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/transfer"
)

// exportStream exports images from the local image store through is.
//
// The work runs under a lease obtained from withLease. When ig implements
// transfer.ImageLookup, all images it looks up are exported; otherwise the
// single image returned by ig.Get is exported. "Exporting" and
// "Completed export" progress events are sent when a progress function is
// configured on tops.
func (ts *localTransferService) exportStream(ctx context.Context, ig transfer.ImageGetter, is transfer.ImageExporter, tops *transfer.Config) error {
        ctx, done, err := ts.withLease(ctx)
        if err != nil {
                return err
        }
        defer done(ctx)

        // report sends a bare progress event when progress is being tracked.
        report := func(event string) {
                if tops.Progress != nil {
                        tops.Progress(transfer.Progress{
                                Event: event,
                        })
                }
        }
        report("Exporting")

        var imgs []images.Image
        if il, ok := ig.(transfer.ImageLookup); ok {
                if imgs, err = il.Lookup(ctx, ts.images); err != nil {
                        return err
                }
        } else {
                img, err := ig.Get(ctx, ts.images)
                if err != nil {
                        return err
                }
                imgs = append(imgs, img)
        }

        if err := is.Export(ctx, ts.content, imgs); err != nil {
                return err
        }

        report("Completed export")
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "context"
        "encoding/json"
        "fmt"

        ocispec "github.com/opencontainers/image-spec/specs-go/v1"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/transfer"
        "github.com/containerd/containerd/v2/core/unpack"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
)

// importStream imports an image archive from i into the local content store
// and records the resulting images through is.
//
// The work runs under a lease obtained from withLease. After the importer
// returns the index descriptor, the index is walked: the top-level index is
// decoded and each of its manifests is queued for storing with the
// "io.containerd.import.ref-source" annotation set to "annotation". If is
// implements transfer.ImageFilterer the walk handler is wrapped by its
// filter, and if it implements transfer.ImageUnpacker with at least one
// unpack configuration, an unpacker is attached to the walk. Finally the
// index and each queued manifest are passed to is.Store; not-found errors
// from Store are logged and skipped. Progress events are sent when a
// progress function is configured on tops.
func (ts *localTransferService) importStream(ctx context.Context, i transfer.ImageImporter, is transfer.ImageStorer, tops *transfer.Config) error {
        ctx, done, err := ts.withLease(ctx)
        if err != nil {
                return err
        }
        defer done(ctx)

        if tops.Progress != nil {
                tops.Progress(transfer.Progress{
                        Event: "Importing",
                })
        }

        index, err := i.Import(ctx, ts.content)
        if err != nil {
                return err
        }

        var (
                descriptors []ocispec.Descriptor
                handler     images.Handler
                unpacker    *unpack.Unpacker
        )

        // If save index, add index
        descriptors = append(descriptors, index)

        var handlerFunc images.HandlerFunc = func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                // Only save images at top level
                if desc.Digest != index.Digest {
                        return images.Children(ctx, ts.content, desc)
                }

                p, err := content.ReadBlob(ctx, ts.content, desc)
                if err != nil {
                        return nil, err
                }

                var idx ocispec.Index
                if err := json.Unmarshal(p, &idx); err != nil {
                        return nil, err
                }

                // m is a per-iteration copy: the annotated descriptor is queued
                // for storing while idx.Manifests itself is returned unchanged.
                for _, m := range idx.Manifests {
                        m.Annotations = mergeMap(m.Annotations, map[string]string{"io.containerd.import.ref-source": "annotation"})
                        descriptors = append(descriptors, m)
                }

                return idx.Manifests, nil
        }

        if f, ok := is.(transfer.ImageFilterer); ok {
                handlerFunc = f.ImageFilter(handlerFunc, ts.content)
        }

        handler = images.Handlers(handlerFunc)

        // First find suitable platforms to unpack into
        // If image storer is also an unpacker type, i.e implemented UnpackPlatforms() func
        if iu, ok := is.(transfer.ImageUnpacker); ok {
                unpacks := iu.UnpackPlatforms()
                if len(unpacks) > 0 {
                        uopts := []unpack.UnpackerOpt{}
                        for _, u := range unpacks {
                                // Only requested platforms matched against the
                                // service configuration are unpacked.
                                matched, mu := getSupportedPlatform(u, ts.config.UnpackPlatforms)
                                if matched {
                                        uopts = append(uopts, unpack.WithUnpackPlatform(mu))
                                }
                        }

                        if ts.config.DuplicationSuppressor != nil {
                                uopts = append(uopts, unpack.WithDuplicationSuppressor(ts.config.DuplicationSuppressor))
                        }
                        unpacker, err = unpack.NewUnpacker(ctx, ts.content, uopts...)
                        if err != nil {
                                return fmt.Errorf("unable to initialize unpacker: %w", err)
                        }
                        handler = unpacker.Unpack(handler)
                }
        }

        if err := images.WalkNotEmpty(ctx, handler, index); err != nil {
                if unpacker != nil {
                        // wait for unpacker to cleanup
                        // (its result is ignored; the walk error is returned)
                        unpacker.Wait()
                }
                // TODO: Handle Not Empty as a special case on the input
                return err
        }

        if unpacker != nil {
                if _, err = unpacker.Wait(); err != nil {
                        return err
                }
        }

        for _, desc := range descriptors {
                // Per-iteration copy so the pointer handed to Progress below
                // refers to this iteration's descriptor.
                desc := desc
                imgs, err := is.Store(ctx, desc, ts.images)
                if err != nil {
                        // A descriptor without any resolvable image name is
                        // skipped rather than failing the import.
                        if errdefs.IsNotFound(err) {
                                log.G(ctx).Infof("No images store for %s", desc.Digest)
                                continue
                        }
                        return err
                }

                if tops.Progress != nil {
                        for _, img := range imgs {
                                tops.Progress(transfer.Progress{
                                        Event: "saved",
                                        Name:  img.Name,
                                        Desc:  &desc,
                                })
                        }
                }
        }

        if tops.Progress != nil {
                tops.Progress(transfer.Progress{
                        Event: "Completed import",
                })
        }

        return nil
}

// mergeMap returns a new map containing every entry of m1 and m2. When a key
// is present in both, the value from m2 wins. Neither input is modified and
// the result is never nil, even when both inputs are nil.
func mergeMap(m1, m2 map[string]string) map[string]string {
        out := make(map[string]string, len(m1)+len(m2))
        // Later sources overwrite earlier ones, so m2 takes precedence.
        for _, src := range [...]map[string]string{m1, m2} {
                for key, val := range src {
                        out[key] = val
                }
        }
        return out
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "context"
        "sort"
        "sync"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/containerd/v2/core/transfer"
        "github.com/containerd/log"
        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// ProgressTracker tracks content transfer jobs and reports their progress
// through HandleProgress.
type ProgressTracker struct {
        // root and transferState are set by NewProgressTracker; transferState
        // is used as the Event of in-progress updates sent by HandleProgress.
        root          string
        transferState string
        // added carries job updates; it is created with a buffer of one.
        added         chan jobUpdate
        // waitC is closed when HandleProgress returns.
        waitC         chan struct{}

        // parents maps a digest to a list of descriptors.
        // NOTE(review): parentL presumably guards parents — confirm against
        // the methods that access it.
        parents map[digest.Digest][]ocispec.Descriptor
        parentL sync.Mutex
}

// jobState is the state of a tracked job.
type jobState uint8

// Job states, in the order a job moves through them.
const (
        // jobAdded is the zero value of jobState.
        jobAdded jobState = iota
        // jobInProgress is set once progress has been reported for the job.
        jobInProgress
        // jobComplete marks a job which is no longer polled for status.
        jobComplete
)

// jobStatus is the bookkeeping kept for each tracked job.
type jobStatus struct {
        // state is the current state of the job.
        state    jobState
        // name is the key used to look up the active status of the job and
        // is reported as the Name of its progress events.
        name     string
        // parents is reported as the Parents of progress events.
        parents  []string
        // progress is the last offset reported for the job.
        progress int64
        // desc is the descriptor the job is tracking.
        desc     ocispec.Descriptor
}

// jobUpdate is the message sent on the ProgressTracker's added channel.
type jobUpdate struct {
        // desc is the descriptor the update refers to.
        desc   ocispec.Descriptor
        // NOTE(review): exists presumably marks content that is already
        // present — confirm against the sender and receiver of jobUpdate.
        exists bool
        //children []ocispec.Descriptor
}

// ActiveJobs provides access to the status of currently active jobs.
type ActiveJobs interface {
        // Status returns the content status for the named job. The boolean
        // reports whether a status was found for that name.
        Status(string) (content.Status, bool)
}

// StatusTracker is the source of job status used by
// ProgressTracker.HandleProgress.
type StatusTracker interface {
        // Active returns the set of active jobs. HandleProgress calls it
        // without any string arguments.
        // NOTE(review): the strings look like reference filters — confirm.
        Active(context.Context, ...string) (ActiveJobs, error)
        // Check is called with a job's digest when no active status is found
        // for it.
        // NOTE(review): the boolean presumably reports whether the content
        // exists — confirm against the implementation.
        Check(context.Context, digest.Digest) (bool, error)
}

// NewProgressTracker returns a ProgressTracker for tracking content download
// progress. The tracker is initialized with the given root and transfer
// state, a job update channel buffered for a single entry, an open wait
// channel and an empty parents map.
func NewProgressTracker(root, transferState string) *ProgressTracker {
        pt := new(ProgressTracker)
        pt.root = root
        pt.transferState = transferState
        pt.added = make(chan jobUpdate, 1)
        pt.waitC = make(chan struct{})
        pt.parents = make(map[digest.Digest][]ocispec.Descriptor)
        return pt
}

// HandleProgress runs the progress loop of the tracker and closes the
// channel Wait blocks on when it returns.
//
// The loop reacts to three things:
//   - an update queued by Add or MarkExists: an unknown descriptor is
//     registered and reported as "waiting"; an update flagged as existing is
//     reported as "already exists" and the job is completed;
//   - a 300ms tick: pt is polled and every unfinished job whose offset grew
//     is reported with the tracker's transfer state, while jobs that are no
//     longer active are checked and, if present, reported as "complete";
//   - ctx being done: pt is polled one final time and the loop returns.
//
// pf receives every event. pt supplies the active statuses and the
// completion check.
func (j *ProgressTracker) HandleProgress(ctx context.Context, pf transfer.ProgressFunc, pt StatusTracker) {
        defer close(j.waitC)
        // Instead of ticker, just delay
        jobs := map[digest.Digest]*jobStatus{}
        tc := time.NewTicker(time.Millisecond * 300)
        defer tc.Stop()

        // update polls pt once and emits an event for every unfinished job
        // whose state changed since the previous poll.
        update := func() {
                // TODO: Filter by references
                active, err := pt.Active(ctx)
                if err != nil {
                        log.G(ctx).WithError(err).Error("failed to get statuses for progress")
                }
                if active == nil {
                        // Without a snapshot there is nothing to query; calling
                        // Status on a nil ActiveJobs would panic.
                        return
                }
                for _, job := range jobs {
                        if job.state == jobComplete {
                                continue
                        }
                        if status, ok := active.Status(job.name); ok {
                                // Only report when the offset actually moved forward.
                                if status.Offset > job.progress {
                                        pf(transfer.Progress{
                                                Event:    j.transferState,
                                                Name:     job.name,
                                                Parents:  job.parents,
                                                Progress: status.Offset,
                                                Total:    status.Total,
                                                Desc:     &job.desc,
                                        })
                                        job.progress = status.Offset
                                        job.state = jobInProgress
                                }
                                continue
                        }

                        // The job is no longer active: report completion if the
                        // content is there, and stop polling it either way.
                        exists, err := pt.Check(ctx, job.desc.Digest)
                        if err != nil {
                                log.G(ctx).WithError(err).Error("failed to get statuses for progress")
                        } else if exists {
                                pf(transfer.Progress{
                                        Event:    "complete",
                                        Name:     job.name,
                                        Parents:  job.parents,
                                        Progress: job.desc.Size,
                                        Total:    job.desc.Size,
                                        Desc:     &job.desc,
                                })
                        }
                        job.state = jobComplete
                }
        }
        for {
                select {
                case added := <-j.added:
                        job, ok := jobs[added.desc.Digest]
                        if !ok {

                                // Only captures the parents defined before,
                                // could handle parent updates in same thread
                                // if there is a synchronization issue
                                var parents []string
                                j.parentL.Lock()
                                for _, parent := range j.parents[added.desc.Digest] {
                                        parents = append(parents, remotes.MakeRefKey(ctx, parent))
                                }
                                j.parentL.Unlock()
                                if len(parents) == 0 {
                                        parents = []string{j.root}
                                }
                                name := remotes.MakeRefKey(ctx, added.desc)

                                job = &jobStatus{
                                        state:   jobAdded,
                                        name:    name,
                                        parents: parents,
                                        desc:    added.desc,
                                }
                                jobs[added.desc.Digest] = job
                                pf(transfer.Progress{
                                        Event:   "waiting",
                                        Name:    name,
                                        Parents: parents,
                                        //Digest:   desc.Digest.String(),
                                        Progress: 0,
                                        Total:    added.desc.Size,
                                        Desc:     &job.desc,
                                })
                        }
                        if added.exists {
                                pf(transfer.Progress{
                                        Event:    "already exists",
                                        Name:     remotes.MakeRefKey(ctx, added.desc),
                                        Progress: added.desc.Size,
                                        Total:    added.desc.Size,
                                        Desc:     &job.desc,
                                })
                                job.state = jobComplete
                                job.progress = job.desc.Size
                        }

                case <-tc.C:
                        update()
                        // Next timer?
                case <-ctx.Done():
                        // Flush one last round of events before stopping.
                        update()
                        return
                }
        }
}

// Add adds a descriptor to be tracked.
//
// It is a no-op on a nil tracker. The update is handed to the HandleProgress
// loop; if that loop has already returned, the update is dropped instead of
// blocking the caller forever on a channel nobody reads from.
func (j *ProgressTracker) Add(desc ocispec.Descriptor) {
        if j == nil {
                return
        }
        select {
        case j.added <- jobUpdate{desc: desc}:
        case <-j.waitC:
                // HandleProgress closed waitC on return; nothing will drain added.
        }
}

// MarkExists reports that the content for desc is already present, which
// makes the HandleProgress loop emit "already exists" and complete the job.
//
// It is a no-op on a nil tracker. If the HandleProgress loop has already
// returned, the update is dropped instead of blocking the caller forever.
func (j *ProgressTracker) MarkExists(desc ocispec.Descriptor) {
        if j == nil {
                return
        }
        select {
        case j.added <- jobUpdate{desc: desc, exists: true}:
        case <-j.waitC:
                // HandleProgress closed waitC on return; nothing will drain added.
        }
}

// AddChildren records desc as a parent of every descriptor in children.
// It does nothing on a nil tracker or when children is empty.
func (j *ProgressTracker) AddChildren(desc ocispec.Descriptor, children []ocispec.Descriptor) {
        if j == nil {
                return
        }
        if len(children) == 0 {
                return
        }

        j.parentL.Lock()
        defer j.parentL.Unlock()
        for i := range children {
                key := children[i].Digest
                j.parents[key] = append(j.parents[key], desc)
        }
}

// Wait blocks until the HandleProgress loop has returned or ten seconds have
// passed, whichever happens first.
func (j *ProgressTracker) Wait() {
        // timeout rather than rely on cancel
        select {
        case <-j.waitC:
        case <-time.After(10 * time.Second):
        }
}

// contentActive is an ActiveJobs snapshot backed by a slice of statuses.
type contentActive struct {
        // active is searched with a binary search on Ref, so it must be
        // sorted by Ref in ascending order.
        active []content.Status
}

// Status looks up ref in the snapshot with a binary search and reports
// whether a status with exactly that ref was found.
func (c *contentActive) Status(ref string) (content.Status, bool) {
        statuses := c.active
        pos := sort.Search(len(statuses), func(i int) bool {
                return statuses[i].Ref >= ref
        })
        if pos >= len(statuses) || statuses[pos].Ref != ref {
                return content.Status{}, false
        }
        return statuses[pos], true
}

// contentStatusTracker implements StatusTracker on top of a content store.
type contentStatusTracker struct {
        cs content.Store
}

// NewContentStatusTracker returns a StatusTracker that answers from the
// given content store.
func NewContentStatusTracker(cs content.Store) StatusTracker {
        tracker := new(contentStatusTracker)
        tracker.cs = cs
        return tracker
}

// Active lists the statuses of the content store and returns them sorted by
// ref. A listing failure is logged and not returned: the caller then gets a
// snapshot built from whatever the store returned.
func (c *contentStatusTracker) Active(ctx context.Context, _ ...string) (ActiveJobs, error) {
        statuses, err := c.cs.ListStatuses(ctx)
        if err != nil {
                log.G(ctx).WithError(err).Error("failed to list statuses for progress")
        }

        byRef := func(a, b int) bool {
                return statuses[a].Ref < statuses[b].Ref
        }
        sort.Slice(statuses, byRef)

        snapshot := &contentActive{active: statuses}
        return snapshot, nil
}

// Check reports whether the content store has info for dgst. Any lookup
// error is treated as "not present"; the returned error is always nil.
func (c *contentStatusTracker) Check(ctx context.Context, dgst digest.Digest) (bool, error) {
        if _, err := c.cs.Info(ctx, dgst); err != nil {
                return false, nil
        }
        return true, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "context"
        "fmt"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/containerd/v2/core/remotes/docker"
        "github.com/containerd/containerd/v2/core/transfer"
        "github.com/containerd/containerd/v2/core/unpack"
        "github.com/containerd/containerd/v2/defaults"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "github.com/sirupsen/logrus"
)

// pull fetches the image resolved by ir into the local content store and
// hands the result to is for storing.
//
// The steps are, in order: acquire a lease, resolve the image, reject
// schema 1 manifests, run every configured image verifier, build the handler
// chain (configured base handlers, fetch, legacy media type detection,
// children listing, distribution source labels), optionally wrap it with an
// unpacker, dispatch it over the resolved descriptor, convert the manifest
// if the legacy config media type was seen, and finally store the image.
//
// When tops.Progress is set, lifecycle events are reported through it and a
// ProgressTracker reports per-descriptor download progress until the
// function returns.
func (ts *localTransferService) pull(ctx context.Context, ir transfer.ImageFetcher, is transfer.ImageStorer, tops *transfer.Config) error {
        ctx, done, err := ts.withLease(ctx)
        if err != nil {
                return err
        }
        defer done(ctx)

        if tops.Progress != nil {
                tops.Progress(transfer.Progress{
                        Event: fmt.Sprintf("Resolving from %s", ir),
                })
        }

        name, desc, err := ir.Resolve(ctx)
        if err != nil {
                return fmt.Errorf("failed to resolve image: %w", err)
        }
        if desc.MediaType == images.MediaTypeDockerSchema1Manifest {
                // Explicitly call out schema 1 as deprecated and not supported
                return fmt.Errorf("schema 1 image manifests are no longer supported: %w", errdefs.ErrInvalidArgument)
        }

        // Verify image before pulling.
        for vfName, vf := range ts.config.Verifiers {
                log := log.G(ctx).WithFields(logrus.Fields{
                        "name":     name,
                        "digest":   desc.Digest.String(),
                        "verifier": vfName,
                })
                log.Debug("Verifying image pull")

                jdg, err := vf.VerifyImage(ctx, name, desc)
                if err != nil {
                        log.WithError(err).Error("No judgement received from verifier")
                        return fmt.Errorf("blocking pull of %v with digest %v: image verifier %v returned error: %w", name, desc.Digest.String(), vfName, err)
                }
                log = log.WithFields(logrus.Fields{
                        "ok":     jdg.OK,
                        "reason": jdg.Reason,
                })

                if !jdg.OK {
                        log.Warn("Image verifier blocked pull")
                        return fmt.Errorf("image verifier %s blocked pull of %v with digest %v for reason: %v", vfName, name, desc.Digest.String(), jdg.Reason)
                }
                log.Debug("Image verifier allowed pull")
        }

        // TODO: Handle already exists
        if tops.Progress != nil {
                tops.Progress(transfer.Progress{
                        Event: fmt.Sprintf("Pulling from %s", ir),
                })
                tops.Progress(transfer.Progress{
                        Event: "fetching image content",
                        Name:  name,
                        //Digest: img.Target.Digest.String(),
                })
        }

        fetcher, err := ir.Fetcher(ctx, name)
        if err != nil {
                return fmt.Errorf("failed to get fetcher for %q: %w", name, err)
        }

        var (
                handler images.Handler

                baseHandlers []images.Handler

                unpacker *unpack.Unpacker

                // has a config media type bug (distribution#1622)
                hasMediaTypeBug1622 bool

                store           = ts.content
                progressTracker *ProgressTracker
        )

        // The deferred calls run in reverse order: cancel stops the progress
        // loop first, then Wait gives it a chance to flush its last events.
        ctx, cancel := context.WithCancel(ctx)
        if tops.Progress != nil {
                progressTracker = NewProgressTracker(name, "downloading") //Pass in first name as root
                go progressTracker.HandleProgress(ctx, tops.Progress, NewContentStatusTracker(store))
                defer progressTracker.Wait()
        }
        defer cancel()

        // Get all the children for a descriptor
        childrenHandler := images.ChildrenHandler(store)

        if f, ok := is.(transfer.ImageFilterer); ok {
                childrenHandler = f.ImageFilter(childrenHandler, store)
        }

        checkNeedsFix := images.HandlerFunc(
                func(_ context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                        // set to true if there is application/octet-stream media type
                        if desc.MediaType == docker.LegacyConfigMediaType {
                                hasMediaTypeBug1622 = true
                        }

                        return []ocispec.Descriptor{}, nil
                },
        )

        appendDistSrcLabelHandler, err := docker.AppendDistributionSourceLabel(store, name)
        if err != nil {
                return err
        }

        // Start from a private copy of the configured base handlers. Handlers
        // are appended below, and appending directly to the service-owned slice
        // could write into backing storage shared with concurrent pulls.
        baseHandlers = append([]images.Handler{}, ts.config.BaseHandlers...)

        if tops.Progress != nil {
                baseHandlers = append(baseHandlers, images.HandlerFunc(
                        func(_ context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                                progressTracker.Add(desc)

                                return []ocispec.Descriptor{}, nil
                        },
                ))

                // Record the parent/child hierarchy for progress reporting.
                baseChildrenHandler := childrenHandler
                childrenHandler = images.HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) (children []ocispec.Descriptor, err error) {
                        children, err = baseChildrenHandler(ctx, desc)
                        if err != nil {
                                return
                        }
                        progressTracker.AddChildren(desc, children)
                        return
                })
        }

        handler = images.Handlers(append(baseHandlers,
                fetchHandler(store, fetcher, progressTracker),
                checkNeedsFix,
                childrenHandler, // List children to track hierarchy
                appendDistSrcLabelHandler,
        )...)

        // First find suitable platforms to unpack into
        // If image storer is also an unpacker type, i.e implemented UnpackPlatforms() func
        if iu, ok := is.(transfer.ImageUnpacker); ok {
                unpacks := iu.UnpackPlatforms()
                if len(unpacks) > 0 {
                        uopts := []unpack.UnpackerOpt{}
                        // Only unpack if requested unpackconfig matches default/supported unpackconfigs
                        for _, u := range unpacks {
                                matched, mu := getSupportedPlatform(u, ts.config.UnpackPlatforms)
                                if matched {
                                        uopts = append(uopts, unpack.WithUnpackPlatform(mu))
                                }
                        }

                        if ts.limiterD != nil {
                                uopts = append(uopts, unpack.WithLimiter(ts.limiterD))
                        }

                        if ts.config.DuplicationSuppressor != nil {
                                uopts = append(uopts, unpack.WithDuplicationSuppressor(ts.config.DuplicationSuppressor))
                        }

                        unpacker, err = unpack.NewUnpacker(ctx, ts.content, uopts...)
                        if err != nil {
                                return fmt.Errorf("unable to initialize unpacker: %w", err)
                        }
                        handler = unpacker.Unpack(handler)
                }
        }

        if err := images.Dispatch(ctx, handler, ts.limiterD, desc); err != nil {
                if unpacker != nil {
                        // wait for unpacker to cleanup
                        unpacker.Wait()
                }
                return err
        }

        // NOTE(fuweid): unpacker defers blobs download. before create image
        // record in ImageService, should wait for unpacking(including blobs
        // download).
        if unpacker != nil {
                if _, err = unpacker.Wait(); err != nil {
                        return err
                }
                // TODO: Check results to make sure unpack was successful
        }

        if hasMediaTypeBug1622 {
                if desc, err = docker.ConvertManifest(ctx, store, desc); err != nil {
                        return err
                }
        }

        imgs, err := is.Store(ctx, desc, ts.images)
        if err != nil {
                return err
        }

        if tops.Progress != nil {
                for _, img := range imgs {
                        tops.Progress(transfer.Progress{
                                Event: "saved",
                                Name:  img.Name,
                        })
                }
        }

        if tops.Progress != nil {
                tops.Progress(transfer.Progress{
                        Event: fmt.Sprintf("Completed pull from %s", ir),
                })
        }

        return nil
}

// fetchHandler returns a handler that fetches each descriptor into ingester
// using fetcher. Schema 1 manifests are rejected. Content that already
// exists is reported to pt and is not treated as an error. The handler never
// returns child descriptors.
func fetchHandler(ingester content.Ingester, fetcher remotes.Fetcher, pt *ProgressTracker) images.HandlerFunc {
        return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                fields := log.Fields{
                        "digest":    desc.Digest,
                        "mediatype": desc.MediaType,
                        "size":      desc.Size,
                }
                ctx = log.WithLogger(ctx, log.G(ctx).WithFields(fields))

                if desc.MediaType == images.MediaTypeDockerSchema1Manifest {
                        return nil, fmt.Errorf("%v not supported", desc.MediaType)
                }

                fetchErr := remotes.Fetch(ctx, ingester, fetcher, desc)
                if !errdefs.IsAlreadyExists(fetchErr) {
                        return nil, fetchErr
                }
                pt.MarkExists(desc)
                return nil, nil
        }
}

// getSupportedPlatform looks for the first supported platform/snapshotter
// combination that satisfies the requested unpack configuration.
//
// A supported entry matches when its platform matches and either the
// snapshotter names are equal, or the request names no snapshotter and the
// entry uses the default one. The boolean reports whether a match was found;
// without a match the returned platform is the zero value.
func getSupportedPlatform(uc transfer.UnpackConfiguration, supportedPlatforms []unpack.Platform) (bool, unpack.Platform) {
        for _, candidate := range supportedPlatforms {
                if !candidate.Platform.Match(uc.Platform) {
                        continue
                }
                // Assume candidate.SnapshotterKey is not empty
                sameSnapshotter := uc.Snapshotter == candidate.SnapshotterKey
                // An empty request means the client did not pick a snapshotter.
                wantsDefault := uc.Snapshotter == "" && candidate.SnapshotterKey == defaults.DefaultSnapshotter
                if sameSnapshotter || wantsDefault {
                        return true, candidate
                }
        }
        return false, unpack.Platform{}
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "context"
        "fmt"
        "sync"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/containerd/v2/core/transfer"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// push uploads the image returned by ig to the destination described by p.
//
// All platforms are pushed. When tops.Progress is set, lifecycle events are
// reported through it and the pusher is wrapped so that a ProgressTracker
// can report per-descriptor upload progress until the function returns.
func (ts *localTransferService) push(ctx context.Context, ig transfer.ImageGetter, p transfer.ImagePusher, tops *transfer.Config) error {
        /*
                // TODO: Platform matching
                if pushCtx.PlatformMatcher == nil {
                        if len(pushCtx.Platforms) > 0 {
                                ps, err := platforms.ParseAll(pushCtx.Platforms)
                                if err != nil {
                                        return err
                                }
                                pushCtx.PlatformMatcher = platforms.Any(ps...)
                        } else {
                                pushCtx.PlatformMatcher = platforms.All
                        }
                }
        */
        // Filter push
        matcher := platforms.All

        img, err := ig.Get(ctx, ts.images)
        if err != nil {
                return err
        }

        if tops.Progress != nil {
                tops.Progress(transfer.Progress{Event: fmt.Sprintf("Pushing to %s", p)})
                tops.Progress(transfer.Progress{
                        Event: "pushing content",
                        Name:  img.Name,
                        //Digest: img.Target.Digest.String(),
                        Desc: &img.Target,
                })
        }

        var pusher remotes.Pusher
        if pusher, err = p.Pusher(ctx, img.Target); err != nil {
                return err
        }

        var wrapper func(images.Handler) images.Handler

        // The deferred calls run in reverse order: cancel stops the progress
        // loop first, then Wait gives it a chance to flush its last events.
        ctx, cancel := context.WithCancel(ctx)
        if tops.Progress != nil {
                tracker := NewProgressTracker(img.Name, "uploading") //Pass in first name as root

                tracked := newProgressPusher(pusher, tracker)
                go tracker.HandleProgress(ctx, tops.Progress, tracked)
                defer tracker.Wait()
                wrapper = tracked.WrapHandler
                pusher = tracked
        }
        defer cancel()

        // TODO: Add handler to track parents
        /*
                // TODO: Add handlers
                if len(pushCtx.BaseHandlers) > 0 {
                        wrapper = func(h images.Handler) images.Handler {
                                h = images.Handlers(append(pushCtx.BaseHandlers, h)...)
                                if pushCtx.HandlerWrapper != nil {
                                        h = pushCtx.HandlerWrapper(h)
                                }
                                return h
                        }
                } else if pushCtx.HandlerWrapper != nil {
                        wrapper = pushCtx.HandlerWrapper
                }
        */
        pushErr := remotes.PushContent(ctx, pusher, img.Target, ts.content, ts.limiterU, matcher, wrapper)
        if pushErr != nil {
                return pushErr
        }

        if tops.Progress != nil {
                tops.Progress(transfer.Progress{
                        Event: "pushed content",
                        Name:  img.Name,
                        //Digest: img.Target.Digest.String(),
                        Desc: &img.Target,
                })
                tops.Progress(transfer.Progress{
                        Event: fmt.Sprintf("Completed push to %s", p),
                        Desc:  &img.Target,
                })
        }

        return nil
}

// progressPusher wraps a remotes.Pusher and records the state of every push
// so that it can serve as the StatusTracker of a ProgressTracker.
type progressPusher struct {
        remotes.Pusher
        // progress is notified about descriptors, hierarchy and content that
        // already exists.
        progress *ProgressTracker

        // status holds the in-flight and completed pushes.
        status *pushStatus
}

// pushStatus is the shared record of pushes; l guards both maps.
type pushStatus struct {
        l        sync.Mutex
        // statuses holds the in-flight pushes keyed by ref.
        statuses map[string]content.Status
        // complete is the set of digests whose push has been marked complete.
        complete map[digest.Digest]struct{}
}

// newProgressPusher wraps pusher so that its pushes are recorded and
// reported to progress. The returned pusher starts with empty status maps.
func newProgressPusher(pusher remotes.Pusher, progress *ProgressTracker) *progressPusher {
        status := &pushStatus{
                statuses: make(map[string]content.Status),
                complete: make(map[digest.Digest]struct{}),
        }
        return &progressPusher{
                Pusher:   pusher,
                progress: progress,
                status:   status,
        }
}

// WrapHandler decorates h so that every handled descriptor is added to the
// progress tracker and the descriptors h returns are registered as its
// children. The result and error of h are passed through unchanged.
func (p *progressPusher) WrapHandler(h images.Handler) images.Handler {
        wrapped := func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                p.progress.Add(desc)
                children, err := h.Handle(ctx, desc)
                p.progress.AddChildren(desc, children)
                return children, err
        }
        return images.HandlerFunc(wrapped)
}

// Push registers d as in flight and starts the push on the wrapped pusher.
//
// On success the returned writer updates the recorded status as data is
// written. If the content already exists remotely, the descriptor is
// reported as existing and marked complete. Every error from the wrapped
// pusher is returned unchanged with a nil writer.
func (p *progressPusher) Push(ctx context.Context, d ocispec.Descriptor) (content.Writer, error) {
        ref := remotes.MakeRefKey(ctx, d)
        p.status.add(ref, d)

        cw, err := p.Pusher.Push(ctx, d)
        switch {
        case err == nil:
                return &progressWriter{
                        Writer:   cw,
                        ref:      ref,
                        desc:     d,
                        status:   p.status,
                        progress: p.progress,
                }, nil
        case errdefs.IsAlreadyExists(err):
                p.progress.MarkExists(d)
                p.status.markComplete(ref, d)
        }
        return nil, err
}

// update adjusts the offset of the in-flight push identified by ref: a
// positive delta is added to the offset, a negative delta resets it to zero
// and a zero delta leaves it unchanged. Unknown refs are ignored.
func (ps *pushStatus) update(ref string, delta int) {
        ps.l.Lock()
        defer ps.l.Unlock()

        current, found := ps.statuses[ref]
        if !found {
                return
        }
        switch {
        case delta > 0:
                current.Offset += int64(delta)
        case delta < 0:
                current.Offset = 0
        }
        ps.statuses[ref] = current
}

// add starts tracking the push of d under ref with a zero offset, unless the
// ref is already tracked or the digest has already been marked complete.
func (ps *pushStatus) add(ref string, d ocispec.Descriptor) {
        fresh := content.Status{
                Ref:       ref,
                Offset:    0,
                Total:     d.Size,
                StartedAt: time.Now(),
        }

        ps.l.Lock()
        defer ps.l.Unlock()
        if _, tracked := ps.statuses[ref]; tracked {
                return
        }
        if _, finished := ps.complete[d.Digest]; finished {
                return
        }
        ps.statuses[ref] = fresh
}
// markComplete stops tracking ref as in flight and records the digest of d
// as complete.
func (ps *pushStatus) markComplete(ref string, d ocispec.Descriptor) {
        ps.l.Lock()
        defer ps.l.Unlock()

        // delete is a no-op when ref is not tracked.
        delete(ps.statuses, ref)
        ps.complete[d.Digest] = struct{}{}
}

// Status returns the in-flight status recorded under name and whether one
// exists.
func (ps *pushStatus) Status(name string) (content.Status, bool) {
        ps.l.Lock()
        defer ps.l.Unlock()

        current, found := ps.statuses[name]
        return current, found
}

// Check reports whether dgst has been marked complete. The returned error is
// always nil.
func (ps *pushStatus) Check(ctx context.Context, dgst digest.Digest) (bool, error) {
        ps.l.Lock()
        defer ps.l.Unlock()

        _, finished := ps.complete[dgst]
        return finished, nil
}

// Active returns the pusher's shared push status as the set of active jobs.
// The returned error is always nil.
func (p *progressPusher) Active(ctx context.Context, _ ...string) (ActiveJobs, error) {
        var jobs ActiveJobs = p.status
        return jobs, nil
}

// Check reports whether the push of dgst has been marked complete, as
// recorded in the pusher's shared push status.
func (p *progressPusher) Check(ctx context.Context, dgst digest.Digest) (bool, error) {
        status := p.status
        return status.Check(ctx, dgst)
}

// progressWriter wraps a content.Writer and records written bytes and
// completion of the push of desc.
type progressWriter struct {
        content.Writer
        // ref is the key under which the push is recorded in status.
        ref      string
        desc     ocispec.Descriptor
        status   *pushStatus
        progress *ProgressTracker
}

// Write forwards p to the wrapped writer and, when the write succeeds, adds
// the number of bytes written to the recorded progress. A failed write
// leaves the recorded progress untouched.
func (pw *progressWriter) Write(p []byte) (n int, err error) {
        // TODO: Handle reset error to reset progress
        if n, err = pw.Writer.Write(p); err == nil {
                pw.status.update(pw.ref, n)
        }
        return n, err
}
// Commit commits the wrapped writer and records the outcome.
//
// The push is marked complete when the commit succeeds or when the content
// already exists remotely; in the latter case the descriptor is also
// reported as existing. Any other failure leaves the push unmarked so that
// it is not later reported as complete. The error of the wrapped writer is
// returned unchanged.
func (pw *progressWriter) Commit(ctx context.Context, size int64, expected digest.Digest, opts ...content.Opt) error {
        err := pw.Writer.Commit(ctx, size, expected, opts...)
        switch {
        case err == nil:
        case errdefs.IsAlreadyExists(err):
                pw.progress.MarkExists(pw.desc)
        default:
                // TODO: Handle reset error to reset progress
                return err
        }
        pw.status.markComplete(pw.ref, pw.desc)
        return err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "context"

        "github.com/containerd/containerd/v2/core/transfer"
)

// tag stores the target of the image returned by ig through is, under a
// lease that is released when the function returns.
func (ts *localTransferService) tag(ctx context.Context, ig transfer.ImageGetter, is transfer.ImageStorer, tops *transfer.Config) error {
        leased, release, err := ts.withLease(ctx)
        if err != nil {
                return err
        }
        defer release(leased)

        img, err := ig.Get(leased, ts.images)
        if err != nil {
                return err
        }

        if _, err := is.Store(leased, img.Target, ts.images); err != nil {
                return err
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "context"
        "fmt"
        "io"
        "time"

        "github.com/containerd/typeurl/v2"
        "golang.org/x/sync/semaphore"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/leases"
        "github.com/containerd/containerd/v2/core/transfer"
        "github.com/containerd/containerd/v2/core/unpack"
        "github.com/containerd/containerd/v2/internal/kmutex"
        "github.com/containerd/containerd/v2/pkg/imageverifier"
        "github.com/containerd/errdefs"
)

// localTransferService implements transfers on top of a content store and
// an image store.
type localTransferService struct {
        content content.Store
        images  images.Store
        // limiter for upload
        // (nil unless MaxConcurrentUploadedLayers is positive)
        limiterU *semaphore.Weighted
        // limiter for download operation
        // (nil unless MaxConcurrentDownloads is positive)
        limiterD *semaphore.Weighted
        config   TransferConfig
}

// NewTransferService returns a Transferrer backed by the given content and
// image stores. Upload and download limiters are only created when the
// corresponding concurrency setting in tc is positive.
func NewTransferService(cs content.Store, is images.Store, tc TransferConfig) transfer.Transferrer {
        var uploads, downloads *semaphore.Weighted
        if n := tc.MaxConcurrentUploadedLayers; n > 0 {
                uploads = semaphore.NewWeighted(int64(n))
        }
        if n := tc.MaxConcurrentDownloads; n > 0 {
                downloads = semaphore.NewWeighted(int64(n))
        }

        return &localTransferService{
                content:  cs,
                images:   is,
                limiterU: uploads,
                limiterD: downloads,
                config:   tc,
        }
}

// Transfer dispatches to the operation matching the dynamic types of src and
// dest. The source type is matched first (fetcher, getter, importer, in that
// order); only destinations valid for that first matching source type are
// considered. Unsupported combinations yield an error wrapping
// errdefs.ErrNotImplemented.
func (ts *localTransferService) Transfer(ctx context.Context, src interface{}, dest interface{}, opts ...transfer.Opt) error {
        cfg := &transfer.Config{}
        for _, apply := range opts {
                apply(cfg)
        }

        // Walk the matrix of supported source/destination combinations.
        switch from := src.(type) {
        case transfer.ImageFetcher:
                if to, ok := dest.(transfer.ImageStorer); ok {
                        return ts.pull(ctx, from, to, cfg)
                }
        case transfer.ImageGetter:
                if to, ok := dest.(transfer.ImagePusher); ok {
                        return ts.push(ctx, from, to, cfg)
                }
                if to, ok := dest.(transfer.ImageExporter); ok {
                        return ts.exportStream(ctx, from, to, cfg)
                }
                if to, ok := dest.(transfer.ImageStorer); ok {
                        return ts.tag(ctx, from, to, cfg)
                }
        case transfer.ImageImporter:
                if to, ok := dest.(transfer.ImageExportStreamer); ok {
                        return ts.echo(ctx, from, to, cfg)
                }
                if to, ok := dest.(transfer.ImageStorer); ok {
                        // TODO: verify imports with ImageVerifiers?
                        return ts.importStream(ctx, from, to, cfg)
                }
        }
        return fmt.Errorf("unable to transfer from %s to %s: %w", name(src), name(dest), errdefs.ErrNotImplemented)
}

// name returns a printable name for a transfer object: its String() result
// if it is a fmt.Stringer, otherwise its type URL if it is a typeurl.Any,
// otherwise its Go type.
func name(t interface{}) string {
        if str, ok := t.(fmt.Stringer); ok {
                return str.String()
        }
        if a, ok := t.(typeurl.Any); ok {
                return a.GetTypeUrl()
        }
        return fmt.Sprintf("%T", t)
}

// echo copies the raw import stream straight into the export stream without
// interpreting it. It is mostly useful for testing. The importer must also
// implement transfer.ImageImportStreamer, otherwise an error wrapping
// errdefs.ErrNotImplemented is returned.
func (ts *localTransferService) echo(ctx context.Context, i transfer.ImageImporter, e transfer.ImageExportStreamer, tops *transfer.Config) error {
        streamer, ok := i.(transfer.ImageImportStreamer)
        if !ok {
                return fmt.Errorf("echo requires access to raw stream: %w", errdefs.ErrNotImplemented)
        }

        in, _, err := streamer.ImportStream(ctx)
        if err != nil {
                return err
        }
        out, _, err := e.ExportStream(ctx)
        if err != nil {
                return err
        }

        // TODO: Use fixed buffer? Send write progress?
        _, copyErr := io.Copy(out, in)
        closeErr := out.Close()
        // A copy failure takes precedence over a close failure.
        if copyErr != nil {
                return copyErr
        }
        return closeErr
}

// withLease returns a context carrying a lease together with a function that
// releases it. If ctx already has a lease, or no lease manager is configured,
// ctx is returned unchanged with a no-op release function. When no options
// are given the new lease gets a random ID and a 24 hour expiration.
func (ts *localTransferService) withLease(ctx context.Context, opts ...leases.Opt) (context.Context, func(context.Context) error, error) {
        noop := func(context.Context) error { return nil }

        if _, leased := leases.FromContext(ctx); leased {
                return ctx, noop, nil
        }

        manager := ts.config.Leases
        if manager == nil {
                return ctx, noop, nil
        }

        if len(opts) == 0 {
                // Fall back to the default lease configuration.
                opts = []leases.Opt{
                        leases.WithRandomID(),
                        leases.WithExpiration(24 * time.Hour),
                }
        }

        lease, err := manager.Create(ctx, opts...)
        if err != nil {
                return ctx, noop, err
        }

        release := func(ctx context.Context) error {
                return manager.Delete(ctx, lease)
        }
        return leases.WithLease(ctx, lease.ID), release, nil
}

// TransferConfig holds the configuration passed to NewTransferService.
type TransferConfig struct {
        // Leases is the manager used to create a lease during an operation
        // when the context does not already carry one.
        Leases leases.Manager

        // MaxConcurrentDownloads is the max concurrent content downloads for pull.
        MaxConcurrentDownloads int
        // MaxConcurrentUploadedLayers is the max concurrent uploads for push.
        MaxConcurrentUploadedLayers int

        // DuplicationSuppressor is used to make sure that there is only one
        // in-flight fetch request or unpack handler for a given descriptor's
        // digest or chain ID.
        DuplicationSuppressor kmutex.KeyedLocker

        // BaseHandlers are a set of handlers which are called on dispatch.
        // These handlers always get called before any operation specific
        // handlers.
        BaseHandlers []images.Handler

        // UnpackPlatforms are used to specify supported combination of platforms and snapshotters
        UnpackPlatforms []unpack.Platform

        // Verifiers verify the image before saving into the image store.
        Verifiers map[string]imageverifier.ImageVerifier

        // RegistryConfigPath is a path to the root directory containing registry-specific configurations
        RegistryConfigPath string
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package plugins

import (
        "fmt"
        "reflect"
        "sync"

        "github.com/containerd/errdefs"
        "github.com/containerd/typeurl/v2"
)

// register maps a type URL to the Go type that ResolveType instantiates for
// it. The embedded RWMutex guards r; r stays nil until the first Register
// call.
var register = struct {
        sync.RWMutex
        r map[string]reflect.Type
}{}

// Register associates the type URL of apiObject with the Go type of
// transferObject (the pointed-to type when a pointer is given), so that
// ResolveType can later create new instances of it.
//
// Register panics if the type URL cannot be determined or if the URL has
// already been registered.
func Register(apiObject, transferObject interface{}) {
        url, err := typeurl.TypeURL(apiObject)
        if err != nil {
                panic(err)
        }

        register.Lock()
        defer register.Unlock()

        // Lazily create the registry on first use.
        if register.r == nil {
                register.r = map[string]reflect.Type{}
        }
        if _, exists := register.r[url]; exists {
                panic(fmt.Sprintf("url already registered: %v", url))
        }

        objType := reflect.TypeOf(transferObject)
        if objType.Kind() == reflect.Ptr {
                objType = objType.Elem()
        }
        register.r[url] = objType
}

// ResolveType returns a pointer to a new zero value of the Go type registered
// for the type URL of the given object. If no type is registered for that
// URL, the returned error wraps errdefs.ErrNotFound.
func ResolveType(any typeurl.Any) (interface{}, error) {
        register.RLock()
        defer register.RUnlock()

        // Looking up a key in a nil map is safe and simply reports a miss.
        t, found := register.r[any.GetTypeUrl()]
        if !found {
                return nil, fmt.Errorf("%v not registered: %w", any.GetTypeUrl(), errdefs.ErrNotFound)
        }
        return reflect.New(t).Interface(), nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package proxy

import (
        "context"
        "errors"
        "fmt"
        "io"

        "google.golang.org/grpc"
        "google.golang.org/protobuf/types/known/anypb"
        "google.golang.org/protobuf/types/known/emptypb"

        transferapi "github.com/containerd/containerd/api/services/transfer/v1"
        transfertypes "github.com/containerd/containerd/api/types/transfer"
        "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/containerd/v2/core/transfer"
        tstreaming "github.com/containerd/containerd/v2/core/transfer/streaming"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/ttrpc"
        "github.com/containerd/typeurl/v2"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// proxyTransferrer forwards transfer requests to a remote transfer service.
type proxyTransferrer struct {
        // client is the service the requests are sent to. GRPC clients are
        // adapted to this interface through convertClient.
        client        transferapi.TTRPCTransferService
        // streamCreator creates the side streams (for example the progress
        // stream) referenced from a transfer request.
        streamCreator streaming.StreamCreator
}

// NewTransferrer returns a transferrer that talks to the containerd transfer
// API over a GRPC or TTRPC connection.
//
// client may be a transferapi.TransferClient, a grpc.ClientConnInterface, a
// transferapi.TTRPCTransferService or a *ttrpc.Client. A value that already
// is a transfer.Transferrer is returned unchanged. Any other type causes a
// panic.
func NewTransferrer(client any, sc streaming.StreamCreator) transfer.Transferrer {
        var svc transferapi.TTRPCTransferService

        switch c := client.(type) {
        case transferapi.TransferClient:
                svc = convertClient{c}
        case grpc.ClientConnInterface:
                svc = convertClient{transferapi.NewTransferClient(c)}
        case transferapi.TTRPCTransferService:
                svc = c
        case *ttrpc.Client:
                svc = transferapi.NewTTRPCTransferClient(c)
        case transfer.Transferrer:
                return c
        default:
                panic(fmt.Errorf("unsupported stream client %T: %w", client, errdefs.ErrNotImplemented))
        }

        return &proxyTransferrer{
                client:        svc,
                streamCreator: sc,
        }
}

// convertClient adapts a GRPC transferapi.TransferClient to the
// transferapi.TTRPCTransferService interface used by proxyTransferrer.
type convertClient struct {
        transferapi.TransferClient
}

// Transfer forwards the request to the wrapped GRPC client without any call
// options.
func (c convertClient) Transfer(ctx context.Context, r *transferapi.TransferRequest) (*emptypb.Empty, error) {
        resp, err := c.TransferClient.Transfer(ctx, r)
        return resp, err
}

// Transfer marshals src and dst and sends them to the remote transfer
// service.
//
// When a progress callback is configured through opts, a progress stream is
// created first and its ID is passed in the request options. A goroutine then
// receives from that stream and invokes the callback for every
// transfertypes.Progress message until Recv returns an error.
//
// It returns the first error encountered while creating the progress stream,
// marshaling src or dst, or performing the remote call.
func (p *proxyTransferrer) Transfer(ctx context.Context, src interface{}, dst interface{}, opts ...transfer.Opt) error {
        o := &transfer.Config{}
        for _, opt := range opts {
                opt(o)
        }
        apiOpts := &transferapi.TransferOptions{}
        if o.Progress != nil {
                sid := tstreaming.GenerateID("progress")
                stream, err := p.streamCreator.Create(ctx, sid)
                if err != nil {
                        return err
                }
                apiOpts.ProgressStream = sid
                go func() {
                        for {
                                a, err := stream.Recv()
                                if err != nil {
                                        if !errors.Is(err, io.EOF) {
                                                log.G(ctx).WithError(err).Error("progress stream failed to recv")
                                        }
                                        return
                                }
                                i, err := typeurl.UnmarshalAny(a)
                                if err != nil {
                                        log.G(ctx).WithError(err).Warnf("failed to unmarshal progress object: %v", a.GetTypeUrl())
                                        // Nothing was decoded; skip the message
                                        // instead of also reporting it as an
                                        // unhandled <nil> object below.
                                        continue
                                }
                                switch v := i.(type) {
                                case *transfertypes.Progress:
                                        // Desc is optional; only convert it when present.
                                        var descp *ocispec.Descriptor
                                        if v.Desc != nil {
                                                desc := oci.DescriptorFromProto(v.Desc)
                                                descp = &desc
                                        }
                                        o.Progress(transfer.Progress{
                                                Event:    v.Event,
                                                Name:     v.Name,
                                                Parents:  v.Parents,
                                                Progress: v.Progress,
                                                Total:    v.Total,
                                                Desc:     descp,
                                        })
                                default:
                                        log.G(ctx).Warnf("unhandled progress object %T: %v", i, a.GetTypeUrl())
                                }
                        }
                }()
        }
        asrc, err := p.marshalAny(ctx, src)
        if err != nil {
                return err
        }
        adst, err := p.marshalAny(ctx, dst)
        if err != nil {
                return err
        }
        req := &transferapi.TransferRequest{
                Source: &anypb.Any{
                        TypeUrl: asrc.GetTypeUrl(),
                        Value:   asrc.GetValue(),
                },
                Destination: &anypb.Any{
                        TypeUrl: adst.GetTypeUrl(),
                        Value:   adst.GetValue(),
                },
                Options: apiOpts,
        }
        _, err = p.client.Transfer(ctx, req)
        return err
}
// marshalAny converts a transfer object to a typeurl.Any. Objects that
// implement streamMarshaler marshal themselves and are handed the stream
// creator; everything else goes through typeurl.MarshalAny.
func (p *proxyTransferrer) marshalAny(ctx context.Context, i interface{}) (typeurl.Any, error) {
        if sm, ok := i.(streamMarshaler); ok {
                return sm.MarshalAny(ctx, p.streamCreator)
        }
        return typeurl.MarshalAny(i)
}

// streamMarshaler is implemented by transfer objects that marshal themselves
// and are given a stream creator to do so. marshalAny prefers this interface
// over typeurl.MarshalAny.
type streamMarshaler interface {
        MarshalAny(context.Context, streaming.StreamCreator) (typeurl.Any, error)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package registry

import (
        "context"
        "errors"
        "fmt"
        "io"
        "net/http"
        "strings"
        "sync"

        transfertypes "github.com/containerd/containerd/api/types/transfer"
        "github.com/containerd/containerd/v2/core/remotes"
        "github.com/containerd/containerd/v2/core/remotes/docker"
        "github.com/containerd/containerd/v2/core/remotes/docker/config"
        "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/containerd/v2/core/transfer"
        "github.com/containerd/containerd/v2/core/transfer/plugins"
        tstreaming "github.com/containerd/containerd/v2/core/transfer/streaming"
        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// init registers OCIRegistry with the transfer plugins registry under the
// type URL of the transfertypes.OCIRegistry API object.
func init() {
        // TODO: Move this to separate package?
        plugins.Register(&transfertypes.OCIRegistry{}, &OCIRegistry{})
}

// registryOpts collects the settings applied by Opt functions before
// NewOCIRegistry builds the registry.
type registryOpts struct {
        // headers are the HTTP header fields passed to the resolver.
        headers       http.Header
        // creds supplies credentials per host; nil means none are configured.
        creds         CredentialHelper
        // hostDir is the root of the host configuration directory; empty
        // means unset.
        hostDir       string
        // defaultScheme is the default scheme for registry hosts; empty
        // means unset.
        defaultScheme string
}

// Opt sets registry-related configurations. NewOCIRegistry stops at the first
// Opt that returns an error and returns that error.
type Opt func(o *registryOpts) error

// WithHeaders sets the HTTP request header fields that the resolver sends.
func WithHeaders(headers http.Header) Opt {
        return func(ro *registryOpts) error {
                ro.headers = headers
                return nil
        }
}

// WithCredentials sets the helper that is asked for credentials for a host.
func WithCredentials(creds CredentialHelper) Opt {
        return func(ro *registryOpts) error {
                ro.creds = creds
                return nil
        }
}

// WithHostDir sets the root of the host configuration directory.
func WithHostDir(hostDir string) Opt {
        return func(ro *registryOpts) error {
                ro.hostDir = hostDir
                return nil
        }
}

// WithDefaultScheme sets the default scheme used for registry configuration.
func WithDefaultScheme(s string) Opt {
        return func(ro *registryOpts) error {
                ro.defaultScheme = s
                return nil
        }
}

// NewOCIRegistry creates an OCIRegistry for ref. The given options configure
// the request headers, credential helper, host configuration directory and
// default scheme used to build the underlying docker resolver. The first
// option that fails aborts construction and its error is returned.
func NewOCIRegistry(ctx context.Context, ref string, opts ...Opt) (*OCIRegistry, error) {
        var ropts registryOpts
        for _, apply := range opts {
                if err := apply(&ropts); err != nil {
                        return nil, err
                }
        }

        hostOptions := config.HostOptions{}
        if dir := ropts.hostDir; dir != "" {
                hostOptions.HostDir = config.HostDirFromRoot(dir)
        }
        if helper := ropts.creds; helper != nil {
                // TODO: Support bearer
                hostOptions.Credentials = func(host string) (string, string, error) {
                        c, err := helper.GetCredentials(context.Background(), ref, host)
                        if err != nil {
                                return "", "", err
                        }

                        return c.Username, c.Secret, nil
                }
        }
        if scheme := ropts.defaultScheme; scheme != "" {
                hostOptions.DefaultScheme = scheme
        }

        return &OCIRegistry{
                reference: ref,
                headers:   ropts.headers,
                creds:     ropts.creds,
                resolver: docker.NewResolver(docker.ResolverOptions{
                        Hosts:   config.ConfigureHosts(ctx, hostOptions),
                        Headers: ropts.headers,
                }),
                hostDir:       ropts.hostDir,
                defaultScheme: ropts.defaultScheme,
        }, nil
}

// CredentialHelper provides the credentials to use for a host when accessing
// the image reference ref.
type CredentialHelper interface {
        GetCredentials(ctx context.Context, ref, host string) (Credentials, error)
}

// Credentials is the result of a CredentialHelper lookup. When marshaled into
// an auth response, a non-empty Header is sent as header authentication,
// otherwise a non-empty Username is sent together with Secret as
// username/secret credentials, otherwise Secret alone is sent as a refresh
// secret.
type Credentials struct {
        Host     string
        Username string
        Secret   string
        Header   string
}

// OCIRegistry is a transfer object for an image reference in an OCI registry.
// Its methods delegate resolving, fetching and pushing to the configured
// resolver.
type OCIRegistry struct {
        // reference is the image reference this registry object refers to.
        reference string

        // headers are the HTTP header fields given to the resolver.
        headers http.Header
        // creds optionally supplies credentials per host.
        creds   CredentialHelper

        resolver remotes.Resolver

        // hostDir is the root of the host configuration directory, if set.
        hostDir string

        // defaultScheme is the default scheme for registry hosts, if set.
        defaultScheme string

        // This could be an interface which returns resolver?
        // Resolver could also be a plug-able interface, to call out to a program to fetch?
}

// String returns a human readable description that includes the reference.
func (r *OCIRegistry) String() string {
        return "OCI Registry (" + r.reference + ")"
}

// Image returns the image reference of this registry object.
func (r *OCIRegistry) Image() string {
        ref := r.reference
        return ref
}

// Resolve resolves the registry's reference through the underlying resolver
// and returns the resolved name and descriptor.
func (r *OCIRegistry) Resolve(ctx context.Context) (name string, desc ocispec.Descriptor, err error) {
        name, desc, err = r.resolver.Resolve(ctx, r.reference)
        return name, desc, err
}

// Fetcher returns a fetcher for ref obtained from the underlying resolver.
func (r *OCIRegistry) Fetcher(ctx context.Context, ref string) (transfer.Fetcher, error) {
        fetcher, err := r.resolver.Fetcher(ctx, ref)
        return fetcher, err
}

// Pusher returns a pusher for the registry's reference. If the reference does
// not already contain a digest ("@"), the digest of desc is appended so that
// the tag is pushed for that single digest only.
func (r *OCIRegistry) Pusher(ctx context.Context, desc ocispec.Descriptor) (transfer.Pusher, error) {
        target := r.reference
        if strings.Contains(target, "@") {
                return r.resolver.Pusher(ctx, target)
        }
        return r.resolver.Pusher(ctx, target+"@"+desc.Digest.String())
}

// MarshalAny encodes the registry as a transfertypes.OCIRegistry message.
//
// Headers are flattened to a single value per key using http.Header.Get. When
// a credential helper is configured, a "creds" stream is created through sm
// and its ID is placed in the resolver's AuthStream field. A goroutine then
// answers each transfertypes.AuthRequest received on that stream with a
// transfertypes.AuthResponse built from the helper's result. The goroutine
// stops when ctx is done or when receiving or sending on the stream fails.
func (r *OCIRegistry) MarshalAny(ctx context.Context, sm streaming.StreamCreator) (typeurl.Any, error) {
        res := &transfertypes.RegistryResolver{}
        if r.headers != nil {
                res.Headers = map[string]string{}
                for k := range r.headers {
                        res.Headers[k] = r.headers.Get(k)
                }
        }
        if r.creds != nil {
                sid := tstreaming.GenerateID("creds")
                stream, err := sm.Create(ctx, sid)
                if err != nil {
                        return nil, err
                }
                go func() {
                        // Check for context cancellation as well
                        for {
                                select {
                                case <-ctx.Done():
                                        return
                                default:
                                }

                                req, err := stream.Recv()
                                if err != nil {
                                        // EOF and cancellation are the expected ways for
                                        // the stream to end; anything else is reported.
                                        if !errors.Is(err, io.EOF) && !errors.Is(err, context.Canceled) {
                                                log.G(ctx).WithError(err).Debug("credential stream failed to recv")
                                        }
                                        return
                                }

                                var s transfertypes.AuthRequest
                                if err := typeurl.UnmarshalTo(req, &s); err != nil {
                                        log.G(ctx).WithError(err).Error("failed to unmarshal credential request")
                                        continue
                                }
                                creds, err := r.creds.GetCredentials(ctx, s.Reference, s.Host)
                                if err != nil {
                                        // NOTE(review): no response is sent for this
                                        // request, so the requesting side gets no answer
                                        // to it - confirm this is intended.
                                        log.G(ctx).WithError(err).Error("failed to get credentials")
                                        continue
                                }
                                // Pick the auth type from whichever credential
                                // fields the helper filled in.
                                var resp transfertypes.AuthResponse
                                if creds.Header != "" {
                                        resp.AuthType = transfertypes.AuthType_HEADER
                                        resp.Secret = creds.Header
                                } else if creds.Username != "" {
                                        resp.AuthType = transfertypes.AuthType_CREDENTIALS
                                        resp.Username = creds.Username
                                        resp.Secret = creds.Secret
                                } else {
                                        resp.AuthType = transfertypes.AuthType_REFRESH
                                        resp.Secret = creds.Secret
                                }

                                a, err := typeurl.MarshalAny(&resp)
                                if err != nil {
                                        log.G(ctx).WithError(err).Error("failed to marshal credential response")
                                        continue
                                }

                                if err := stream.Send(a); err != nil {
                                        if !errors.Is(err, io.EOF) {
                                                log.G(ctx).WithError(err).Error("unexpected send failure")
                                        }
                                        return
                                }
                        }

                }()
                res.AuthStream = sid
        }
        res.HostDir = r.hostDir
        res.DefaultScheme = r.defaultScheme
        s := &transfertypes.OCIRegistry{
                Reference: r.reference,
                Resolver:  res,
        }

        return typeurl.MarshalAny(s)
}

// UnmarshalAny populates the registry from a marshaled
// transfertypes.OCIRegistry message and builds a new docker resolver for it.
//
// When the message names an auth stream, the stream is fetched from sm and
// credentials are requested over it through a credCallback. It returns an
// error if the message cannot be decoded or the auth stream cannot be
// obtained; in both cases the receiver is left unmodified.
func (r *OCIRegistry) UnmarshalAny(ctx context.Context, sm streaming.StreamGetter, a typeurl.Any) error {
        var s transfertypes.OCIRegistry
        if err := typeurl.UnmarshalTo(a, &s); err != nil {
                return err
        }

        hostOptions := config.HostOptions{}
        if res := s.Resolver; res != nil {
                if res.HostDir != "" {
                        hostOptions.HostDir = config.HostDirFromRoot(res.HostDir)
                }
                if res.DefaultScheme != "" {
                        hostOptions.DefaultScheme = res.DefaultScheme
                }
                if sid := res.AuthStream; sid != "" {
                        stream, err := sm.Get(ctx, sid)
                        if err != nil {
                                log.G(ctx).WithError(err).WithField("stream", sid).Debug("failed to get auth stream")
                                return err
                        }
                        r.creds = &credCallback{
                                stream: stream,
                        }
                        hostOptions.Credentials = func(host string) (string, string, error) {
                                c, err := r.creds.GetCredentials(context.Background(), s.Reference, host)
                                if err != nil {
                                        return "", "", err
                                }

                                return c.Username, c.Secret, nil
                        }
                }

                // Rebuild the header set from the flattened key/value map.
                hdrs := http.Header{}
                for key, value := range res.Headers {
                        hdrs.Add(key, value)
                }
                r.headers = hdrs
        }

        r.reference = s.Reference
        r.resolver = docker.NewResolver(docker.ResolverOptions{
                Hosts:   config.ConfigureHosts(ctx, hostOptions),
                Headers: r.headers,
        })

        return nil
}

// credCallback is a CredentialHelper that obtains credentials by sending an
// auth request over a stream and reading the response from it. The embedded
// mutex is held for the whole request/response exchange in GetCredentials.
type credCallback struct {
        sync.Mutex
        stream streaming.Stream
}

// GetCredentials sends a transfertypes.AuthRequest for ref and host over the
// stream, waits for the transfertypes.AuthResponse and converts it into
// Credentials. The lock is held across the exchange so that concurrent
// callers do not interleave requests and responses on the stream.
func (cc *credCallback) GetCredentials(ctx context.Context, ref, host string) (Credentials, error) {
        cc.Lock()
        defer cc.Unlock()

        request, err := typeurl.MarshalAny(&transfertypes.AuthRequest{
                Host:      host,
                Reference: ref,
        })
        if err != nil {
                return Credentials{}, err
        }
        if err := cc.stream.Send(request); err != nil {
                return Credentials{}, err
        }

        reply, err := cc.stream.Recv()
        if err != nil {
                return Credentials{}, err
        }
        var auth transfertypes.AuthResponse
        if err := typeurl.UnmarshalTo(reply, &auth); err != nil {
                return Credentials{}, err
        }

        // Only the fields relevant to the reported auth type are filled in.
        result := Credentials{Host: host}
        switch auth.AuthType {
        case transfertypes.AuthType_CREDENTIALS:
                result.Username, result.Secret = auth.Username, auth.Secret
        case transfertypes.AuthType_REFRESH:
                result.Secret = auth.Secret
        case transfertypes.AuthType_HEADER:
                result.Header = auth.Secret
        }

        return result, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package streaming

import (
        "context"
        "errors"
        "fmt"
        "io"

        transferapi "github.com/containerd/containerd/api/types/transfer"
        "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/typeurl/v2"
)

// readByteStream is the io.ReadCloser returned by ReadByteStream. It reads
// data messages from a stream.
type readByteStream struct {
        ctx       context.Context
        stream    streaming.Stream
        // window is increased by the goroutine started in ReadByteStream
        // after each window update it sends, and decreased by Read by the
        // number of bytes it copies out of a received data message.
        window    int32
        // updated is signalled by Read when window drops below windowSize.
        updated   chan struct{}
        // errCh carries errors from the window update goroutine to Read.
        errCh     chan error
        // remaining holds the bytes of the last data message that did not
        // fit into the buffer passed to Read.
        remaining []byte
}

// ReadByteStream returns an io.ReadCloser that yields the bytes carried by the
// data messages received on stream.
//
// A background goroutine sends a transferapi.WindowUpdate of windowSize on
// the stream whenever the local window is below windowSize, and otherwise
// waits for Read to signal that the window shrank. The goroutine exits when
// ctx is done or when sending on the stream fails. A send failure other than
// io.EOF, or a marshal failure, is handed to Read through errCh.
//
// NOTE(review): the window counter is written both here and in Read without
// synchronization.
func ReadByteStream(ctx context.Context, stream streaming.Stream) io.ReadCloser {
        rbs := &readByteStream{
                ctx:     ctx,
                stream:  stream,
                window:  0,
                errCh:   make(chan error),
                updated: make(chan struct{}, 1),
        }
        // report hands err to Read, giving up once ctx is done so that the
        // goroutine cannot block forever when nobody reads anymore.
        report := func(err error) {
                select {
                case rbs.errCh <- err:
                case <-ctx.Done():
                }
        }
        go func() {
                for {
                        if rbs.window >= windowSize {
                                select {
                                case <-ctx.Done():
                                        return
                                case <-rbs.updated:
                                        continue
                                }
                        }
                        update := &transferapi.WindowUpdate{
                                Update: windowSize,
                        }
                        anyType, err := typeurl.MarshalAny(update)
                        if err != nil {
                                report(err)
                                return
                        }
                        if err := stream.Send(anyType); err != nil {
                                // A failed send is terminal: retrying would spin
                                // on the broken stream because the window never
                                // grows. EOF is an expected end and not reported.
                                if !errors.Is(err, io.EOF) {
                                        report(err)
                                }
                                return
                        }
                        rbs.window += windowSize
                }

        }()
        return rbs
}

// Read implements io.Reader.
//
// Bytes left over from a previously received data message are returned first.
// Otherwise Read fails if the context is done or the window update goroutine
// reported an error, then receives the next message from the stream. The
// payload of a transferapi.Data message is copied into p and whatever does
// not fit is kept for the next call. Any other message type is an error.
func (r *readByteStream) Read(p []byte) (n int, err error) {
        plen := len(p)
        if len(r.remaining) > 0 {
                copied := copy(p, r.remaining)
                if len(r.remaining) > plen {
                        r.remaining = r.remaining[plen:]
                } else {
                        r.remaining = nil
                }
                return copied, nil
        }
        select {
        case <-r.ctx.Done():
                return 0, r.ctx.Err()
        case err := <-r.errCh:
                return 0, err
        default:
        }
        anyType, err := r.stream.Recv()
        if err != nil {
                return 0, err
        }
        i, err := typeurl.UnmarshalAny(anyType)
        if err != nil {
                return 0, err
        }
        switch v := i.(type) {
        case *transferapi.Data:
                n := copy(p, v.Data)
                if len(v.Data) > plen {
                        r.remaining = v.Data[plen:]
                }
                r.window = r.window - int32(n)
                if r.window < windowSize {
                        // Signal without blocking: updated has a one-slot
                        // buffer, so a pending signal already guarantees the
                        // window is re-checked, and a blocking send could hang
                        // Read once the update goroutine has stopped.
                        select {
                        case r.updated <- struct{}{}:
                        default:
                        }
                }
                return n, nil
        default:
                return 0, fmt.Errorf("stream received error type %v", v)
        }

}

// Close closes the underlying stream and returns its error.
func (r *readByteStream) Close() error {
        err := r.stream.Close()
        return err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package streaming

import (
        "context"
        "crypto/rand"
        "encoding/base64"
        "errors"
        "fmt"
        "io"
        "sync"
        "time"

        transferapi "github.com/containerd/containerd/api/types/transfer"
        "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
)

const (
        // maxRead is the size in bytes of the buffers handed out by bufPool.
        maxRead = 32 * 1024
        // windowSize is the flow-control window, twice maxRead.
        windowSize = 2 * maxRead
)

// bufPool recycles maxRead-sized byte buffers. Entries are pointers to slices
// so that putting one back does not allocate.
var bufPool = &sync.Pool{
        New: func() interface{} {
                buf := make([]byte, maxRead)
                return &buf
        },
}

// SendStream copies data from r to the remote side of stream while honoring
// the flow-control window advertised by the receiver. It starts two
// goroutines and returns immediately:
//
//   - a receiver that decodes WindowUpdate messages from the stream and hands
//     their values to the sender; it closes the window channel when it exits,
//   - a sender that reads at most maxRead bytes at a time from r (never more
//     than the remaining window) and sends them as Data messages.
//
// The sender closes the stream when it exits: on context cancellation, when r
// is exhausted or fails, when marshalling or sending fails, or when the
// receiver goroutine has ended and no window credit is left.
func SendStream(ctx context.Context, r io.Reader, stream streaming.Stream) {
        window := make(chan int32)
        go func() {
                defer close(window)
                for {
                        select {
                        case <-ctx.Done():
                                return
                        default:
                        }

                        anyType, err := stream.Recv()
                        if err != nil {
                                if !errors.Is(err, io.EOF) && !errors.Is(err, context.Canceled) {
                                        log.G(ctx).WithError(err).Error("send stream ended without EOF")
                                }
                                return
                        }
                        i, err := typeurl.UnmarshalAny(anyType)
                        if err != nil {
                                log.G(ctx).WithError(err).Error("failed to unmarshal stream object")
                                continue
                        }
                        switch v := i.(type) {
                        case *transferapi.WindowUpdate:
                                select {
                                case <-ctx.Done():
                                        return
                                case window <- v.Update:
                                }
                        default:
                                log.G(ctx).Errorf("unexpected stream object of type %T", i)
                        }
                }
        }()
        go func() {
                defer stream.Close()

                buf := bufPool.Get().(*[]byte)
                defer bufPool.Put(buf)

                var remaining int32

                for {
                        if remaining > 0 {
                                // Credit is still available: pick up a pending
                                // update if there is one, but do not wait for it.
                                select {
                                case <-ctx.Done():
                                        // TODO: Send error message on stream before close to allow remote side to return error
                                        return
                                case update := <-window:
                                        // A closed channel yields zero here, which
                                        // leaves the remaining credit untouched.
                                        remaining += update
                                default:
                                }
                        } else {
                                // Block until window updated
                                select {
                                case <-ctx.Done():
                                        // TODO: Send error message on stream before close to allow remote side to return error
                                        return
                                case update, ok := <-window:
                                        if !ok {
                                                // The receiver goroutine has exited, so no
                                                // further credit can arrive. Stop instead of
                                                // spinning on zero-length reads.
                                                return
                                        }
                                        remaining = update
                                }
                        }
                        if remaining <= 0 {
                                // A zero or negative update grants nothing to send;
                                // wait for the next one rather than slicing the
                                // buffer with a non-positive bound.
                                continue
                        }

                        var size int32 = maxRead
                        if size > remaining {
                                size = remaining
                        }
                        b := (*buf)[:size]
                        n, err := r.Read(b)
                        if n > 0 {
                                // io.Reader may return data together with an error
                                // (including io.EOF); forward the bytes before
                                // looking at the error so the tail is not lost.
                                remaining -= int32(n)

                                data := &transferapi.Data{
                                        Data: b[:n],
                                }
                                anyType, merr := typeurl.MarshalAny(data)
                                if merr != nil {
                                        log.G(ctx).WithError(merr).Errorf("failed to marshal data for send")
                                        // TODO: Send error message on stream before close to allow remote side to return error
                                        return
                                }
                                if serr := stream.Send(anyType); serr != nil {
                                        log.G(ctx).WithError(serr).Errorf("send failed")
                                        return
                                }
                        }
                        if err != nil {
                                if !errors.Is(err, io.EOF) {
                                        log.G(ctx).WithError(err).Errorf("failed to read stream source")
                                        // TODO: Send error message on stream before close to allow remote side to return error
                                }
                                return
                        }
                }
        }()
}

// ReceiveStream returns a reader that yields the bytes carried by Data
// messages received on stream.
//
// A goroutine drives the stream: whenever its local window counter is below
// windowSize it sends a WindowUpdate granting windowSize more credit, then
// receives one message and writes Data payloads into an io.Pipe whose read
// end is returned. The goroutine closes the stream when it exits.
//
// The pipe is closed without error when Recv reports io.EOF or
// context.Canceled; any other receive, send, unmarshal or write failure is
// delivered to the reader as the pipe's error. Messages of an unknown type
// are logged and ignored.
func ReceiveStream(ctx context.Context, stream streaming.Stream) io.Reader {
        r, w := io.Pipe()
        go func() {
                defer stream.Close()
                var window int32
                for {
                        var werr error
                        if window < windowSize {
                                update := &transferapi.WindowUpdate{
                                        Update: windowSize,
                                }
                                anyType, err := typeurl.MarshalAny(update)
                                if err != nil {
                                        w.CloseWithError(fmt.Errorf("failed to marshal window update: %w", err))
                                        return
                                }
                                // check window update error after recv, stream may be complete
                                if werr = stream.Send(anyType); werr == nil {
                                        window += windowSize
                                } else if errors.Is(werr, io.EOF) {
                                        // TODO: Why does send return EOF here
                                        werr = nil
                                }
                        }
                        anyType, err := stream.Recv()
                        if err != nil {
                                if errors.Is(err, io.EOF) || errors.Is(err, context.Canceled) {
                                        // Normal termination: readers see a plain EOF.
                                        err = nil
                                } else {
                                        err = fmt.Errorf("received failed: %w", err)
                                }
                                w.CloseWithError(err)
                                return
                        }
                        if werr != nil {
                                // The receive was attempted first; only now report
                                // the earlier window update failure.
                                w.CloseWithError(fmt.Errorf("failed to send window update: %w", werr))
                                return
                        }
                        i, err := typeurl.UnmarshalAny(anyType)
                        if err != nil {
                                w.CloseWithError(fmt.Errorf("failed to unmarshal received object: %w", err))
                                return
                        }
                        switch v := i.(type) {
                        case *transferapi.Data:
                                n, err := w.Write(v.Data)
                                if err != nil {
                                        // This is a pipe write failure, so report it as
                                        // such rather than as an unmarshal problem.
                                        w.CloseWithError(fmt.Errorf("failed to write received data: %w", err))
                                        // Close will error out sender
                                        return
                                }
                                window = window - int32(n)
                        // TODO: Handle error case
                        default:
                                log.G(ctx).Warnf("Ignoring unknown stream object of type %T", i)
                                continue
                        }
                }

        }()

        return r
}

// GenerateID builds an identifier of the form "<prefix>-<nanos>-<random>",
// where <nanos> is the nanosecond offset within the current second and
// <random> is three random bytes in URL-safe base64 (four characters).
func GenerateID(prefix string) string {
        var entropy [3]byte
        rand.Read(entropy[:])
        suffix := base64.URLEncoding.EncodeToString(entropy[:])

        nanos := time.Now().Nanosecond()
        return fmt.Sprintf("%s-%d-%s", prefix, nanos, suffix)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package streaming

import (
        "context"
        "errors"
        "io"
        "sync/atomic"

        transferapi "github.com/containerd/containerd/api/types/transfer"
        "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
)

// WriteByteStream wraps stream in an io.WriteCloser whose writes are limited
// by the window credit granted by the remote side.
//
// A background goroutine receives messages from the stream until the context
// is done or Recv fails. Each WindowUpdate adds its value to the writer's
// remaining credit and makes a non-blocking attempt to wake a waiting Write.
func WriteByteStream(ctx context.Context, stream streaming.Stream) io.WriteCloser {
        w := &writeByteStream{
                ctx:     ctx,
                stream:  stream,
                updated: make(chan struct{}, 1),
        }

        receiveUpdates := func() {
                for {
                        select {
                        case <-ctx.Done():
                                return
                        default:
                        }

                        msg, err := stream.Recv()
                        if err != nil {
                                expected := errors.Is(err, io.EOF) || errors.Is(err, context.Canceled)
                                if !expected {
                                        log.G(ctx).WithError(err).Error("send byte stream ended without EOF")
                                }
                                return
                        }

                        i, err := typeurl.UnmarshalAny(msg)
                        if err != nil {
                                log.G(ctx).WithError(err).Error("failed to unmarshal stream object")
                                continue
                        }

                        update, ok := i.(*transferapi.WindowUpdate)
                        if !ok {
                                log.G(ctx).Errorf("unexpected stream object of type %T", i)
                                continue
                        }

                        atomic.AddInt32(&w.remaining, update.Update)
                        select {
                        case <-ctx.Done():
                                return
                        case w.updated <- struct{}{}:
                        default:
                                // A wake-up is already pending or nobody is
                                // waiting; never block the receive loop.
                        }
                }
        }
        go receiveUpdates()

        return w
}

// writeByteStream is the io.WriteCloser returned by WriteByteStream. It sends
// written bytes as Data messages on stream, limited by the window credit
// tracked in remaining.
type writeByteStream struct {
        // ctx is consulted by Write while waiting for credit and used for logging.
        ctx       context.Context
        // stream carries the outgoing Data messages and is closed by Close.
        stream    streaming.Stream
        // remaining is the window credit in bytes; it is read and modified
        // with sync/atomic operations only.
        remaining int32
        // updated has capacity one and is signalled when a window update
        // has been added to remaining.
        updated   chan struct{}
}

// Write sends p on the stream as a sequence of Data messages, each carrying
// at most maxRead bytes and never more than the currently available window
// credit. When no credit is available it blocks until a window update is
// signalled or the context is done.
//
// It returns the number of bytes sent. err is io.ErrShortWrite when the
// context ends before all of p is sent, or the marshal/send error that
// stopped the write.
func (wbs *writeByteStream) Write(p []byte) (n int, err error) {
        for len(p) > 0 {
                remaining := atomic.LoadInt32(&wbs.remaining)
                if remaining <= 0 {
                        // No credit (or a negative balance after a bogus
                        // update): wait for the next window update instead of
                        // slicing p with a non-positive bound.
                        select {
                        case <-wbs.ctx.Done():
                                // TODO: Send error message on stream before close to allow remote side to return error
                                return n, io.ErrShortWrite
                        case <-wbs.updated:
                                continue
                        }
                }

                // Compute the chunk size in int so that very large buffers
                // are not truncated by a conversion to int32.
                size := maxRead
                if len(p) < size {
                        size = len(p)
                }
                if int(remaining) < size {
                        size = int(remaining)
                }

                data := &transferapi.Data{
                        Data: p[:size],
                }
                var anyType typeurl.Any
                anyType, err = typeurl.MarshalAny(data)
                if err != nil {
                        log.G(wbs.ctx).WithError(err).Errorf("failed to marshal data for send")
                        // TODO: Send error message on stream before close to allow remote side to return error
                        return n, err
                }
                if err = wbs.stream.Send(anyType); err != nil {
                        log.G(wbs.ctx).WithError(err).Errorf("send failed")
                        return n, err
                }

                // Account for the chunk only after it was sent successfully.
                n += size
                p = p[size:]
                atomic.AddInt32(&wbs.remaining, -int32(size))
        }
        return n, nil
}

// Close closes the underlying stream and returns whatever error the stream's
// Close method reports.
func (wbs *writeByteStream) Close() error {
        return wbs.stream.Close()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package transfer

import (
        "context"
        "io"

        ocispec "github.com/opencontainers/image-spec/specs-go/v1"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
)

// Transferrer performs a transfer from a source to a destination. Both
// endpoints are passed as untyped values; which concrete types are accepted
// is up to the implementation and is not visible in this declaration.
type Transferrer interface {
        // Transfer runs the transfer with the given options applied and
        // reports failure through the returned error.
        Transfer(ctx context.Context, source interface{}, destination interface{}, opts ...Opt) error
}

// ImageResolver resolves an image to a name and an OCI descriptor.
type ImageResolver interface {
        // Resolve returns the resolved name and descriptor, or an error.
        Resolve(ctx context.Context) (name string, desc ocispec.Descriptor, err error)
}

// ImageFetcher is an ImageResolver which can additionally provide a Fetcher
// for a reference.
type ImageFetcher interface {
        ImageResolver

        // Fetcher returns a Fetcher for ref.
        // NOTE(review): ref is presumably the name returned by Resolve — confirm against callers.
        Fetcher(ctx context.Context, ref string) (Fetcher, error)
}

// ImagePusher provides a Pusher for a given descriptor.
type ImagePusher interface {
        // Pusher returns a Pusher for the provided descriptor, or an error.
        Pusher(context.Context, ocispec.Descriptor) (Pusher, error)
}

// Fetcher opens the content identified by a descriptor for reading.
type Fetcher interface {
        // Fetch returns a reader for the described content. The result is an
        // io.ReadCloser, so the caller is expected to close it.
        Fetch(context.Context, ocispec.Descriptor) (io.ReadCloser, error)
}

// Pusher opens a writer for the content identified by a descriptor.
type Pusher interface {
        // Push returns a content.Writer for the described content, or an error.
        Push(context.Context, ocispec.Descriptor) (content.Writer, error)
}

// ImageFilterer is used to filter out child objects of an image.
type ImageFilterer interface {
        // ImageFilter takes an image handler and a content store and returns
        // the handler to use in its place.
        ImageFilter(images.HandlerFunc, content.Store) images.HandlerFunc
}

// ImageStorer is a type which is capable of storing images for
// the provided descriptor. The descriptor may be any type of manifest
// including an index with multiple image references.
type ImageStorer interface {
        // Store records the described image(s) in the given image store and
        // returns the resulting images.
        Store(context.Context, ocispec.Descriptor, images.Store) ([]images.Image, error)
}

// ImageGetter is a type which returns an image from an image store.
type ImageGetter interface {
        // Get returns a single image from the given image store, or an error.
        Get(context.Context, images.Store) (images.Image, error)
}

// ImageLookup is a type which returns images from an image store
// based on names or prefixes.
type ImageLookup interface {
        // Lookup returns the matching images from the given image store.
        Lookup(context.Context, images.Store) ([]images.Image, error)
}

// ImageExporter exports images to a writer. The writer is not part of the
// method signature, so it is held by the implementation.
type ImageExporter interface {
        // Export exports the given images, reading from the content store.
        Export(context.Context, content.Store, []images.Image) error
}

// ImageImporter imports an image into a content store.
type ImageImporter interface {
        // Import writes the image into the given content store and returns a
        // descriptor for what was imported.
        Import(context.Context, content.Store) (ocispec.Descriptor, error)
}

// ImageImportStreamer returns an import streamer based on OCI or
// Docker image tar archives. The stream should be a raw tar stream
// and without compression.
type ImageImportStreamer interface {
        // ImportStream returns the reader to import from.
        // NOTE(review): the string result is presumably the stream's media
        // type — confirm against implementations.
        ImportStream(context.Context) (io.Reader, string, error)
}

// ImageExportStreamer provides a writer that an export can be streamed to.
type ImageExportStreamer interface {
        // ExportStream returns the destination writer; it is an
        // io.WriteCloser, so the caller is expected to close it.
        // NOTE(review): the string result is presumably the stream's media
        // type — confirm against implementations.
        ExportStream(context.Context) (io.WriteCloser, string, error)
}

// ImageUnpacker reports the unpack configurations that apply to a transfer.
type ImageUnpacker interface {
        // UnpackPlatforms returns the list of unpack configurations.
        UnpackPlatforms() []UnpackConfiguration
}

// UnpackConfiguration specifies the platform and snapshotter to use for resolving
// the unpack Platform, if snapshotter is not specified the platform default will
// be used.
type UnpackConfiguration struct {
        // Platform is the OCI platform this configuration applies to.
        Platform    ocispec.Platform
        // Snapshotter names the snapshotter; empty selects the platform default.
        Snapshotter string
}

// ProgressFunc is a callback which receives Progress updates.
type ProgressFunc func(Progress)

// Config holds the settings that Opt functions can modify for a transfer.
type Config struct {
        // Progress, when set (see WithProgress), is the progress callback.
        Progress ProgressFunc
}

// Opt is a functional option which mutates a transfer Config.
type Opt func(*Config)

// WithProgress returns an Opt which installs f as the Config's progress
// callback.
func WithProgress(f ProgressFunc) Opt {
        return func(cfg *Config) {
                cfg.Progress = f
        }
}

// Progress is used to represent a particular progress event or incremental
// update for the provided named object. The parents represent the names of
// the objects which initiated the progress for the provided named object.
// The name and what object it represents is determined by the implementation.
type Progress struct {
        // Event identifies the kind of progress event; the set of values is
        // defined by the implementation.
        Event    string
        // Name is the name of the object the update refers to.
        Name     string
        // Parents lists the names of the objects which initiated this progress.
        Parents  []string
        // Progress and Total are presumably the completed and overall amounts
        // for the object; units are not visible here — confirm with producers.
        Progress int64
        Total    int64
        Desc     *ocispec.Descriptor // since containerd v2.0
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package unpack

import (
        "context"
        "crypto/rand"
        "encoding/base64"
        "encoding/json"
        "errors"
        "fmt"
        "strconv"
        "sync"
        "sync/atomic"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/internal/cleanup"
        "github.com/containerd/containerd/v2/internal/kmutex"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/containerd/v2/pkg/tracing"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
        "github.com/opencontainers/go-digest"
        "github.com/opencontainers/image-spec/identity"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "golang.org/x/sync/errgroup"
        "golang.org/x/sync/semaphore"
)

const (
        // labelSnapshotRef is the snapshot label key which is set to the chain
        // ID when an extraction snapshot is prepared.
        labelSnapshotRef = "containerd.io/snapshot.ref"
        // unpackSpanPrefix is the prefix used for the names of tracing spans
        // started by this package.
        unpackSpanPrefix = "pkg.unpack.unpacker"
)

// Result returns information about the unpacks which were completed.
type Result struct {
        // Unpacks is the number of image unpacks that were started for a
        // matching platform, as counted by the Unpacker.
        Unpacks int
}

// unpackerConfig holds the settings of an Unpacker which UnpackerOpt
// functions can modify.
type unpackerConfig struct {
        // platforms lists the unpack targets; the first whose matcher accepts
        // an image's platform is used for that image.
        platforms []*Platform

        // content is the store from which image configs are read and on which
        // labels are updated.
        content content.Store

        // limiter, when non-nil, bounds the number of concurrent layer fetches.
        limiter               *semaphore.Weighted
        // duplicationSuppressor is the keyed locker set via
        // WithDuplicationSuppressor; NewUnpacker defaults it to a no-op locker.
        duplicationSuppressor kmutex.KeyedLocker
}

// Platform represents a platform-specific unpack configuration which includes
// the platform matcher as well as snapshotter and applier.
type Platform struct {
        // Platform selects the images this configuration applies to.
        // WithUnpackPlatform defaults it to platforms.All when nil.
        Platform platforms.Matcher

        // SnapshotterKey names the snapshotter; it is used in lock keys and in
        // the gc.ref.snapshot label written to the image config.
        SnapshotterKey string
        // Snapshotter is the snapshotter layers are unpacked into (required).
        Snapshotter    snapshots.Snapshotter
        // SnapshotOpts are passed to the snapshotter's Prepare and Commit calls.
        SnapshotOpts   []snapshots.Opt

        // Applier applies each layer onto the prepared snapshot mounts (required).
        Applier   diff.Applier
        // ApplyOpts are passed to every Apply call.
        ApplyOpts []diff.ApplyOpt
}

// UnpackerOpt is a functional option for NewUnpacker. It mutates the
// unpacker configuration and may reject invalid settings by returning an error.
type UnpackerOpt func(*unpackerConfig) error

// WithUnpackPlatform returns an option which registers u as an unpack target.
//
// When the option is applied, a nil matcher is replaced by platforms.All and
// an empty SnapshotterKey is derived from the snapshotter's String method if
// it has one, otherwise set to "unknown". The option fails if either the
// snapshotter or the applier is missing.
func WithUnpackPlatform(u Platform) UnpackerOpt {
        return func(c *unpackerConfig) error {
                if u.Platform == nil {
                        u.Platform = platforms.All
                }

                if u.Snapshotter == nil {
                        return fmt.Errorf("snapshotter must be provided to unpack")
                }

                if u.SnapshotterKey == "" {
                        // Fall back to a placeholder unless the snapshotter can
                        // name itself.
                        u.SnapshotterKey = "unknown"
                        if named, ok := u.Snapshotter.(fmt.Stringer); ok {
                                u.SnapshotterKey = named.String()
                        }
                }

                if u.Applier == nil {
                        return fmt.Errorf("applier must be provided to unpack")
                }

                c.platforms = append(c.platforms, &u)
                return nil
        }
}

// WithLimiter returns an option which sets the semaphore used to bound
// concurrent layer fetches.
func WithLimiter(l *semaphore.Weighted) UnpackerOpt {
        return func(cfg *unpackerConfig) error {
                cfg.limiter = l
                return nil
        }
}

// WithDuplicationSuppressor returns an option which replaces the unpacker's
// keyed locker with d.
func WithDuplicationSuppressor(d kmutex.KeyedLocker) UnpackerOpt {
        return func(cfg *unpackerConfig) error {
                cfg.duplicationSuppressor = d
                return nil
        }
}

// Unpacker unpacks images by hooking into the image handler process.
// Unpacks happen in the background and are waited on to complete.
type Unpacker struct {
        unpackerConfig

        // unpacks counts the unpacks started for a matching platform; it is
        // incremented atomically and reported by Wait.
        unpacks int32
        // ctx is the errgroup-derived context used by background unpacks.
        ctx     context.Context
        // eg runs the background unpack goroutines and collects their errors.
        eg      *errgroup.Group
}

// NewUnpacker creates a new instance of the unpacker which can be used to wrap an
// image handler and unpack in parallel to handling. The unpacker will handle
// calling the block handlers when they are needed by the unpack process.
//
// Options are applied in order and the first failing option aborts
// construction. At least one unpack platform must be configured, otherwise an
// error wrapping errdefs.ErrInvalidArgument is returned.
func NewUnpacker(ctx context.Context, cs content.Store, opts ...UnpackerOpt) (*Unpacker, error) {
        group, groupCtx := errgroup.WithContext(ctx)

        unpacker := &Unpacker{
                unpackerConfig: unpackerConfig{
                        content:               cs,
                        duplicationSuppressor: kmutex.NewNoop(),
                },
                ctx: groupCtx,
                eg:  group,
        }

        for _, apply := range opts {
                err := apply(&unpacker.unpackerConfig)
                if err != nil {
                        return nil, err
                }
        }

        if len(unpacker.platforms) == 0 {
                return nil, fmt.Errorf("no unpack platforms defined: %w", errdefs.ErrInvalidArgument)
        }
        return unpacker, nil
}

// Unpack wraps an image handler to filter out blob handling and scheduling them
// during the unpack process. When an image config is encountered, the unpack
// process will be started in a goroutine.
//
// For a manifest, layer children are withheld from the returned children and
// remembered under the digest of every non-layer child. For a config with
// remembered layers, the unpack is scheduled on the unpacker's errgroup.
func (u *Unpacker) Unpack(h images.Handler) images.Handler {
        var (
                mu      sync.Mutex
                pending = make(map[digest.Digest][]ocispec.Descriptor)
        )

        handle := func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                ctx, span := tracing.StartSpan(ctx, tracing.Name(unpackSpanPrefix, "UnpackHandler"))
                defer span.End()
                span.SetAttributes(
                        tracing.Attribute("descriptor.media.type", desc.MediaType),
                        tracing.Attribute("descriptor.digest", desc.Digest.String()))

                // Hold the blob lock only for the duration of the wrapped handler.
                release, err := u.lockBlobDescriptor(ctx, desc)
                if err != nil {
                        return nil, err
                }
                children, err := h.Handle(ctx, desc)
                release()
                if err != nil {
                        return children, err
                }

                switch {
                case images.IsManifestType(desc.MediaType):
                        // Split layers from non-layers, layers will be handled
                        // after the config.
                        var others, blobs []ocispec.Descriptor
                        for idx, child := range children {
                                span.SetAttributes(
                                        tracing.Attribute("descriptor.child."+strconv.Itoa(idx), []string{child.MediaType, child.Digest.String()}),
                                )
                                if !images.IsLayerType(child.MediaType) {
                                        others = append(others, child)
                                        continue
                                }
                                blobs = append(blobs, child)
                        }

                        mu.Lock()
                        for _, other := range others {
                                pending[other.Digest] = blobs
                        }
                        mu.Unlock()

                        children = others

                case images.IsConfigType(desc.MediaType):
                        mu.Lock()
                        configLayers := pending[desc.Digest]
                        mu.Unlock()

                        if len(configLayers) > 0 {
                                u.eg.Go(func() error {
                                        return u.unpack(h, desc, configLayers)
                                })
                        }
                }

                return children, nil
        }

        return images.HandlerFunc(handle)
}

// Wait waits for any ongoing unpack processes to complete then will return
// the result. If any unpack failed, the first error is returned together
// with an empty Result.
func (u *Unpacker) Wait() (Result, error) {
        err := u.eg.Wait()
        if err != nil {
                return Result{}, err
        }

        res := Result{Unpacks: int(u.unpacks)}
        return res, nil
}

// unpack unpacks the given layers of the image described by config into the
// snapshotter of the first configured platform that matches the image.
//
// It reads and decodes the image config, checks that the number of layers
// equals the number of diff IDs, and then processes the layers in order: for
// each layer whose chain ID snapshot does not exist yet it prepares an
// extraction snapshot, waits for the layer to be fetched through h, applies
// it, verifies the resulting diff ID and commits the snapshot. Finally the
// config blob is labelled with a gc reference to the top chain ID.
//
// If no platform matches, the layers are only fetched. The context used is
// the unpacker's own context (u.ctx), not one supplied by the caller.
func (u *Unpacker) unpack(
        h images.Handler,
        config ocispec.Descriptor,
        layers []ocispec.Descriptor,
) error {
        ctx := u.ctx
        ctx, layerSpan := tracing.StartSpan(ctx, tracing.Name(unpackSpanPrefix, "unpack"))
        defer layerSpan.End()
        unpackStart := time.Now()
        p, err := content.ReadBlob(ctx, u.content, config)
        if err != nil {
                return err
        }

        var i ocispec.Image
        if err := json.Unmarshal(p, &i); err != nil {
                return fmt.Errorf("unmarshal image config: %w", err)
        }
        // Layers and diff IDs are paired by index below, so the counts must agree.
        diffIDs := i.RootFS.DiffIDs
        if len(layers) != len(diffIDs) {
                return fmt.Errorf("number of layers and diffIDs don't match: %d != %d", len(layers), len(diffIDs))
        }

        // TODO: Support multiple unpacks rather than just first match
        var unpack *Platform

        imgPlatform := platforms.Normalize(i.Platform)
        for _, up := range u.platforms {
                if up.Platform.Match(imgPlatform) {
                        unpack = up
                        break
                }
        }

        if unpack == nil {
                log.G(ctx).WithField("image", config.Digest).WithField("platform", platforms.Format(imgPlatform)).Debugf("unpacker does not support platform, only fetching layers")
                return u.fetch(ctx, h, layers, nil)
        }

        atomic.AddInt32(&u.unpacks, 1)

        var (
                sn = unpack.Snapshotter
                a  = unpack.Applier
                cs = u.content

                // chain accumulates the diff IDs of the layers processed so far.
                chain []digest.Digest

                // fetchOffset is the index of the first layer that needed
                // fetching; fetchC holds one channel per layer from that index
                // on, closed when the layer has been fetched; fetchErr carries
                // the fetch error, if any, and is closed when fetching ends.
                fetchOffset int
                fetchC      []chan struct{}
                fetchErr    chan error
        )

        // If there is an early return, ensure any ongoing
        // fetches get their context cancelled
        ctx, cancel := context.WithCancel(ctx)
        defer cancel()

        // doUnpackFn unpacks layer i (desc) on top of the chain built so far.
        // It returns nil without doing work when a snapshot for the resulting
        // chain ID already exists.
        doUnpackFn := func(i int, desc ocispec.Descriptor) error {
                parent := identity.ChainID(chain)
                chain = append(chain, diffIDs[i])
                chainID := identity.ChainID(chain).String()

                unlock, err := u.lockSnChainID(ctx, chainID, unpack.SnapshotterKey)
                if err != nil {
                        return err
                }
                defer unlock()

                if _, err := sn.Stat(ctx, chainID); err == nil {
                        // no need to handle
                        return nil
                } else if !errdefs.IsNotFound(err) {
                        return fmt.Errorf("failed to stat snapshot %s: %w", chainID, err)
                }

                // inherits annotations which are provided as snapshot labels.
                snapshotLabels := snapshots.FilterInheritedLabels(desc.Annotations)
                if snapshotLabels == nil {
                        snapshotLabels = make(map[string]string)
                }
                snapshotLabels[labelSnapshotRef] = chainID

                var (
                        key    string
                        mounts []mount.Mount
                        opts   = append(unpack.SnapshotOpts, snapshots.WithLabels(snapshotLabels))
                )

                // Up to three attempts, each with a freshly generated key.
                for try := 1; try <= 3; try++ {
                        // Prepare snapshot with from parent, label as root
                        key = fmt.Sprintf(snapshots.UnpackKeyFormat, uniquePart(), chainID)
                        mounts, err = sn.Prepare(ctx, key, parent.String(), opts...)
                        if err != nil {
                                if errdefs.IsAlreadyExists(err) {
                                        if _, err := sn.Stat(ctx, chainID); err != nil {
                                                if !errdefs.IsNotFound(err) {
                                                        return fmt.Errorf("failed to stat snapshot %s: %w", chainID, err)
                                                }
                                                // Try again, this should be rare, log it
                                                log.G(ctx).WithField("key", key).WithField("chainid", chainID).Debug("extraction snapshot already exists, chain id not found")
                                        } else {
                                                // no need to handle, snapshot now found with chain id
                                                return nil
                                        }
                                } else {
                                        return fmt.Errorf("failed to prepare extraction snapshot %q: %w", key, err)
                                }
                        } else {
                                break
                        }
                }
                // err is still set here only if every attempt hit "already exists".
                if err != nil {
                        return fmt.Errorf("unable to prepare extraction snapshot: %w", err)
                }

                // Abort the snapshot if commit does not happen
                abort := func(ctx context.Context) {
                        if err := sn.Remove(ctx, key); err != nil {
                                log.G(ctx).WithError(err).Errorf("failed to cleanup %q", key)
                        }
                }

                // Start fetching lazily: the first layer that actually needs to
                // be unpacked kicks off the fetch of itself and all later layers.
                if fetchErr == nil {
                        fetchErr = make(chan error, 1)
                        fetchOffset = i
                        fetchC = make([]chan struct{}, len(layers)-fetchOffset)
                        for i := range fetchC {
                                fetchC[i] = make(chan struct{})
                        }

                        go func(i int) {
                                err := u.fetch(ctx, h, layers[i:], fetchC)
                                if err != nil {
                                        fetchErr <- err
                                }
                                close(fetchErr)
                        }(i)
                }

                // Wait until this layer is fetched, fetching fails, or the
                // context is done.
                select {
                case <-ctx.Done():
                        cleanup.Do(ctx, abort)
                        return ctx.Err()
                case err := <-fetchErr:
                        if err != nil {
                                cleanup.Do(ctx, abort)
                                return err
                        }
                case <-fetchC[i-fetchOffset]:
                }

                diff, err := a.Apply(ctx, desc, mounts, unpack.ApplyOpts...)
                if err != nil {
                        cleanup.Do(ctx, abort)
                        return fmt.Errorf("failed to extract layer %s: %w", diffIDs[i], err)
                }
                // The applied content must match the diff ID from the image config.
                if diff.Digest != diffIDs[i] {
                        cleanup.Do(ctx, abort)
                        return fmt.Errorf("wrong diff id calculated on extraction %q", diffIDs[i])
                }

                if err = sn.Commit(ctx, chainID, key, opts...); err != nil {
                        cleanup.Do(ctx, abort)
                        // A snapshot with this chain ID already exists, so the
                        // layer counts as unpacked.
                        if errdefs.IsAlreadyExists(err) {
                                return nil
                        }
                        return fmt.Errorf("failed to commit snapshot %s: %w", key, err)
                }

                // Set the uncompressed label after the uncompressed
                // digest has been verified through apply.
                cinfo := content.Info{
                        Digest: desc.Digest,
                        Labels: map[string]string{
                                labels.LabelUncompressed: diff.Digest.String(),
                        },
                }
                if _, err := cs.Update(ctx, cinfo, "labels."+labels.LabelUncompressed); err != nil {
                        return err
                }
                return nil
        }

        // Layers are processed strictly in order because each chain ID depends
        // on the diff IDs of all previous layers.
        for i, desc := range layers {
                _, layerSpan := tracing.StartSpan(ctx, tracing.Name(unpackSpanPrefix, "unpackLayer"))
                unpackLayerStart := time.Now()
                layerSpan.SetAttributes(
                        tracing.Attribute("layer.media.type", desc.MediaType),
                        tracing.Attribute("layer.media.size", desc.Size),
                        tracing.Attribute("layer.media.digest", desc.Digest.String()),
                )
                if err := doUnpackFn(i, desc); err != nil {
                        layerSpan.SetStatus(err)
                        layerSpan.End()
                        return err
                }
                layerSpan.End()
                log.G(ctx).WithFields(log.Fields{
                        "layer":    desc.Digest,
                        "duration": time.Since(unpackLayerStart),
                }).Debug("layer unpacked")
        }

        // Label the config blob with a reference to the top-level snapshot for
        // this snapshotter.
        chainID := identity.ChainID(chain).String()
        cinfo := content.Info{
                Digest: config.Digest,
                Labels: map[string]string{
                        fmt.Sprintf("containerd.io/gc.ref.snapshot.%s", unpack.SnapshotterKey): chainID,
                },
        }
        _, err = cs.Update(ctx, cinfo, fmt.Sprintf("labels.containerd.io/gc.ref.snapshot.%s", unpack.SnapshotterKey))
        if err != nil {
                return err
        }
        log.G(ctx).WithFields(log.Fields{
                "config":   config.Digest,
                "chainID":  chainID,
                "duration": time.Since(unpackStart),
        }).Debug("image unpacked")

        return nil
}

// fetch runs the handler h against every layer descriptor concurrently.
//
// For each layer a "fetchLayer" tracing span is started and a slot is taken
// through u.acquire before the worker goroutine is launched, so concurrency is
// bounded whenever a limiter is configured. Each worker takes the per-blob
// duplication lock, calls h.Handle, and then releases both the lock and the
// slot. An images.ErrSkipDesc result from the handler is not treated as a
// failure. When done is non-nil, done[i] is closed after layer i has been
// handled successfully, so done is indexed with the same positions as layers.
//
// It returns the acquire error if a slot cannot be obtained, otherwise the
// result of waiting on the error group.
func (u *Unpacker) fetch(ctx context.Context, h images.Handler, layers []ocispec.Descriptor, done []chan struct{}) error {
        eg, ctx2 := errgroup.WithContext(ctx)
        for i, desc := range layers {
                ctx2, layerSpan := tracing.StartSpan(ctx2, tracing.Name(unpackSpanPrefix, "fetchLayer"))
                layerSpan.SetAttributes(
                        tracing.Attribute("layer.media.type", desc.MediaType),
                        tracing.Attribute("layer.media.size", desc.Size),
                        tracing.Attribute("layer.media.digest", desc.Digest.String()),
                )
                desc := desc
                var ch chan struct{}
                if done != nil {
                        ch = done[i]
                }

                if err := u.acquire(ctx); err != nil {
                        // No goroutine is started for this layer, so the span has to
                        // be finished here; otherwise it would never be ended.
                        layerSpan.SetStatus(err)
                        layerSpan.End()
                        return err
                }

                eg.Go(func() error {
                        defer layerSpan.End()

                        unlock, err := u.lockBlobDescriptor(ctx2, desc)
                        if err != nil {
                                u.release()
                                return err
                        }

                        _, err = h.Handle(ctx2, desc)

                        // Release the blob lock and the limiter slot before looking
                        // at the handler result so both are freed on every path.
                        unlock()
                        u.release()

                        if err != nil && !errors.Is(err, images.ErrSkipDesc) {
                                return err
                        }
                        if ch != nil {
                                close(ch)
                        }

                        return nil
                })
        }

        return eg.Wait()
}

// acquire takes one unit from the unpacker's limiter. It is a no-op that
// returns nil when no limiter is configured.
func (u *Unpacker) acquire(ctx context.Context) error {
        if u.limiter != nil {
                return u.limiter.Acquire(ctx, 1)
        }
        return nil
}

// release gives one unit back to the unpacker's limiter. It does nothing when
// no limiter is configured.
func (u *Unpacker) release() {
        if u.limiter != nil {
                u.limiter.Release(1)
        }
}

// lockSnChainID locks the duplication suppressor on the key derived from the
// given chain ID and snapshotter. On success it returns a function that
// releases that same key; on failure it returns the lock error.
func (u *Unpacker) lockSnChainID(ctx context.Context, chainID, snapshotter string) (func(), error) {
        lockKey := u.makeChainIDKeyWithSnapshotter(chainID, snapshotter)

        err := u.duplicationSuppressor.Lock(ctx, lockKey)
        if err != nil {
                return nil, err
        }

        unlock := func() {
                u.duplicationSuppressor.Unlock(lockKey)
        }
        return unlock, nil
}

// lockBlobDescriptor locks the duplication suppressor on the key derived from
// the descriptor. On success it returns a function that releases that same
// key; on failure it returns the lock error.
func (u *Unpacker) lockBlobDescriptor(ctx context.Context, desc ocispec.Descriptor) (func(), error) {
        lockKey := u.makeBlobDescriptorKey(desc)

        err := u.duplicationSuppressor.Lock(ctx, lockKey)
        if err != nil {
                return nil, err
        }

        unlock := func() {
                u.duplicationSuppressor.Unlock(lockKey)
        }
        return unlock, nil
}

// makeChainIDKeyWithSnapshotter builds the lock key for a snapshot chain ID,
// in the form "sn://<snapshotter>/<chainID>".
func (u *Unpacker) makeChainIDKeyWithSnapshotter(chainID, snapshotter string) string {
        key := fmt.Sprintf("sn://%s/%v", snapshotter, chainID)
        return key
}

// makeBlobDescriptorKey builds the lock key for a blob, in the form
// "blob://<digest>", from the descriptor's digest.
func (u *Unpacker) makeBlobDescriptorKey(desc ocispec.Descriptor) string {
        key := fmt.Sprintf("blob://%v", desc.Digest)
        return key
}

// uniquePart returns a short string of the form "<nanoseconds>-<suffix>",
// where the first part is the nanosecond component of the current time and
// the suffix is three random bytes in URL-safe base64.
func uniquePart() string {
        now := time.Now()
        var suffix [3]byte
        // A failed read only makes the suffix less random, so the error is
        // deliberately not checked.
        rand.Read(suffix[:])
        encoded := base64.URLEncoding.EncodeToString(suffix[:])
        return fmt.Sprintf("%d-%s", now.Nanosecond(), encoded)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package cleanup provides utilities to help with cleanup.
package cleanup

import (
        "context"
        "time"
)

// clearCancel wraps a context and overrides Deadline, Done and Err so that the
// wrapped context's deadline, done channel and error are not visible through
// it. Everything else is delegated to the embedded context.
type clearCancel struct {
        context.Context
}

// Deadline always reports the zero time and false, i.e. no deadline.
func (clearCancel) Deadline() (deadline time.Time, ok bool) {
        return time.Time{}, false
}

// Done always returns a nil channel.
func (clearCancel) Done() <-chan struct{} {
        var never <-chan struct{}
        return never
}

// Err always returns nil.
func (clearCancel) Err() error {
        var none error
        return none
}

// Background wraps ctx in a context that hides the parent's deadline, done
// channel and error while still embedding the parent.
func Background(ctx context.Context) context.Context {
        detached := clearCancel{Context: ctx}
        return detached
}

// Do runs the provided function with a context in which the parent's errors
// are cleared out and which times out after 10 seconds.
//
// The timeout context is cancelled when Do returns. The cancel is deferred so
// that the context's resources are released even if do panics.
func Do(ctx context.Context, do func(context.Context)) {
        ctx, cancel := context.WithTimeout(clearCancel{ctx}, 10*time.Second)
        defer cancel()
        do(ctx)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package annotations

import (
        customopts "github.com/containerd/containerd/v2/internal/cri/opts"
        "github.com/containerd/containerd/v2/pkg/oci"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// ContainerType values
// Following OCI annotations are used by katacontainers now.
// We'll switch to standard secure pod API after it is defined in CRI.
const (
        // ContainerTypeSandbox represents a pod sandbox container
        ContainerTypeSandbox = "sandbox"

        // ContainerTypeContainer represents a container running within a pod
        ContainerTypeContainer = "container"

        // ContainerType is the container type (sandbox or container) annotation
        ContainerType = "io.kubernetes.cri.container-type"

        // SandboxID is the sandbox ID annotation
        SandboxID = "io.kubernetes.cri.sandbox-id"

        // SandboxCPUPeriod, SandboxCPUQuota and SandboxCPUShares are the sandbox CPU annotations. They are based
        // on the initial CPU configuration for the sandbox. This is calculated as the sum of container CPU
        // resources, optionally provided by Kubelet (introduced in 1.23) as part of the PodSandboxConfig.
        SandboxCPUPeriod = "io.kubernetes.cri.sandbox-cpu-period"
        SandboxCPUQuota  = "io.kubernetes.cri.sandbox-cpu-quota"
        SandboxCPUShares = "io.kubernetes.cri.sandbox-cpu-shares"

        // SandboxMem is the initial amount of memory associated with this sandbox. This is calculated as the sum
        // of container memory, optionally provided by Kubelet (introduced in 1.23) as part of the PodSandboxConfig.
        SandboxMem = "io.kubernetes.cri.sandbox-memory"

        // SandboxLogDir is the pod log directory annotation.
        // If the sandbox needs to generate any log, it will put it into this directory.
        // Kubelet will be responsible for:
        // 1) Monitoring the disk usage of the log, and including it as part of the pod
        // ephemeral storage usage.
        // 2) Cleaning up the logs when the pod is deleted.
        // NOTE: Kubelet is not responsible for rotating the logs.
        SandboxLogDir = "io.kubernetes.cri.sandbox-log-directory"

        // UntrustedWorkload is the sandbox annotation for untrusted workload. Untrusted
        // workload can only run on dedicated runtime for untrusted workload.
        UntrustedWorkload = "io.kubernetes.cri.untrusted-workload"

        // SandboxNamespace is the name of the namespace of the sandbox (pod)
        SandboxNamespace = "io.kubernetes.cri.sandbox-namespace"

        // SandboxUID is the uid of the sandbox (pod) passed to CRI via RunPodSandbox,
        // this field is useful for linking the uid created by the CRI client (e.g. kubelet)
        // to the internal Sandbox.ID created by the containerd sandbox service
        SandboxUID = "io.kubernetes.cri.sandbox-uid"

        // SandboxName is the name of the sandbox (pod)
        SandboxName = "io.kubernetes.cri.sandbox-name"

        // ContainerName is the name of the container in the pod
        ContainerName = "io.kubernetes.cri.container-name"

        // ImageName is the name of the image used to create the container
        ImageName = "io.kubernetes.cri.image-name"

        // SandboxImageName is the name of the sandbox image
        SandboxImageName = "io.kubernetes.cri.podsandbox.image-name"

        // PodAnnotations are the annotations of the pod
        PodAnnotations = "io.kubernetes.cri.pod-annotations"

        // RuntimeHandler is an experimental annotation key for getting runtime handler from pod annotations.
        // See https://github.com/containerd/containerd/issues/6657 and https://github.com/containerd/containerd/pull/6899 for details.
        // The value of this annotation should be the runtime for sandboxes.
        // e.g. for [plugins.cri.containerd.runtimes.runc] runtime config, this value should be runc
        // TODO: we should deprecate this annotation as soon as kubelet supports passing RuntimeHandler from PullImageRequest
        RuntimeHandler = "io.containerd.cri.runtime-handler"

        // WindowsHostProcess is used by hcsshim to identify windows pods that are running HostProcesses
        WindowsHostProcess = "microsoft.com/hostprocess-container"
)

// DefaultCRIAnnotations builds the default set of CRI annotation options to
// pass to sandboxes and containers.
//
// The sandbox ID, namespace, UID and name annotations are always included.
// For a sandbox, the log directory and sandbox image name are added; for a
// container, the container name and image name are added instead. The
// container-type annotation is always appended last.
func DefaultCRIAnnotations(
        sandboxID string,
        containerName string,
        imageName string,
        config *runtime.PodSandboxConfig,
        sandbox bool,
) []oci.SpecOpts {
        meta := config.GetMetadata()
        opts := []oci.SpecOpts{
                customopts.WithAnnotation(SandboxID, sandboxID),
                customopts.WithAnnotation(SandboxNamespace, meta.GetNamespace()),
                customopts.WithAnnotation(SandboxUID, meta.GetUid()),
                customopts.WithAnnotation(SandboxName, meta.GetName()),
        }

        if !sandbox {
                // Containers carry their own name and the image they were created from.
                return append(
                        opts,
                        customopts.WithAnnotation(ContainerName, containerName),
                        customopts.WithAnnotation(ImageName, imageName),
                        customopts.WithAnnotation(ContainerType, ContainerTypeContainer),
                )
        }

        // Sandboxes carry the log directory and the sandbox image name.
        return append(
                opts,
                customopts.WithAnnotation(SandboxLogDir, config.GetLogDirectory()),
                customopts.WithAnnotation(SandboxImageName, imageName),
                customopts.WithAnnotation(ContainerType, ContainerTypeSandbox),
        )
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package bandwidth

import (
        "github.com/containerd/errdefs"

        "k8s.io/apimachinery/pkg/api/resource"
)

// FakeShaper is a stub implementation of bandwidth.Shaper.
// Beware: this implementation has no features besides Reset and GetCIDRs.
type FakeShaper struct {
        // CIDRs is the list returned by GetCIDRs.
        CIDRs []string
        // ResetCIDRs records every CIDR passed to Reset, in call order.
        ResetCIDRs []string
}

// Limit is not implemented and always returns errdefs.ErrNotImplemented.
func (s *FakeShaper) Limit(cidr string, egress, ingress *resource.Quantity) error {
        return errdefs.ErrNotImplemented
}

// Reset records the given CIDR in ResetCIDRs and always succeeds.
func (s *FakeShaper) Reset(cidr string) error {
        s.ResetCIDRs = append(s.ResetCIDRs, cidr)
        return nil
}

// ReconcileInterface is not implemented and always returns errdefs.ErrNotImplemented.
func (s *FakeShaper) ReconcileInterface() error {
        return errdefs.ErrNotImplemented
}

// ReconcileCIDR is not implemented and always returns errdefs.ErrNotImplemented.
func (s *FakeShaper) ReconcileCIDR(cidr string, egress, ingress *resource.Quantity) error {
        return errdefs.ErrNotImplemented
}

// GetCIDRs returns the CIDRs field as-is, with a nil error.
func (s *FakeShaper) GetCIDRs() ([]string, error) {
        return s.CIDRs, nil
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package bandwidth

import (
        "bufio"
        "bytes"
        "encoding/hex"
        "fmt"
        "net"
        "regexp"
        "strings"

        "k8s.io/apimachinery/pkg/api/resource"
        "k8s.io/apimachinery/pkg/util/sets"
        "k8s.io/utils/exec"

        "k8s.io/klog/v2"
)

var (
        // classShowMatcher captures the class ID (for example "1:1") from a
        // "class htb ..." line; it is applied to `tc class show` output.
        classShowMatcher      = regexp.MustCompile(`class htb (1:\d+)`)
        // classAndHandleMatcher captures, in order, the filter handle (the value
        // after "fh") and the flow ID (the value after "flowid") from a
        // "filter parent 1: ..." line of `tc filter show` output.
        classAndHandleMatcher = regexp.MustCompile(`filter parent 1:.*fh (\d+::\d+).*flowid (\d+:\d+)`)
)

// tcShaper provides an implementation of the Shaper interface on Linux using the 'tc' tool.
// In general, using this requires that the caller possess the NET_CAP_ADMIN capability, though if you
// do this within a container, it only requires the NS_CAPABLE capability for manipulations to that
// container's network namespace.
// Uses the hierarchical token bucket queuing discipline (htb), this requires Linux 2.4.20 or newer
// or a custom kernel with that queuing discipline backported.
type tcShaper struct {
        // e is used to create every tc command this shaper runs.
        e     exec.Interface
        // iface is the network device name passed to tc as "dev".
        iface string
}

// NewTCShaper returns a Shaper backed by the 'tc' tool for the given
// interface, using a freshly created command executor.
func NewTCShaper(iface string) Shaper {
        return &tcShaper{
                iface: iface,
                e:     exec.New(),
        }
}

// execAndLog runs cmdStr with args through the shaper's executor, logging the
// command line before it runs and its combined output afterwards (both at
// verbosity 6). It returns the error from running the command.
func (t *tcShaper) execAndLog(cmdStr string, args ...string) error {
        joined := strings.Join(args, " ")
        klog.V(6).Infof("Running: %s %s", cmdStr, joined)

        output, runErr := t.e.Command(cmdStr, args...).CombinedOutput()
        klog.V(6).Infof("Output from tc: %s", string(output))

        return runErr
}

// nextClassID returns the lowest class number N in [1, 10000) for which no
// "1:N" htb class is listed by `tc class show` on the shaper's interface.
//
// It returns -1 and an error if the tc command fails, if a non-empty output
// line does not match the expected "class htb 1:N ..." form, if the output
// cannot be scanned completely, or if every candidate number is taken.
func (t *tcShaper) nextClassID() (int, error) {
        data, err := t.e.Command("tc", "class", "show", "dev", t.iface).CombinedOutput()
        if err != nil {
                return -1, err
        }

        scanner := bufio.NewScanner(bytes.NewBuffer(data))
        classes := sets.Set[string]{}
        for scanner.Scan() {
                line := strings.TrimSpace(scanner.Text())
                // skip empty lines
                if len(line) == 0 {
                        continue
                }
                // expected tc line:
                // class htb 1:1 root prio 0 rate 1000Kbit ceil 1000Kbit burst 1600b cburst 1600b
                matches := classShowMatcher.FindStringSubmatch(line)
                if len(matches) != 2 {
                        return -1, fmt.Errorf("unexpected output from tc: %s (%v)", scanner.Text(), matches)
                }
                classes.Insert(matches[1])
        }
        // A scan that stopped early would leave the set incomplete and could
        // hand out a class ID that is already in use, so surface the error.
        if err := scanner.Err(); err != nil {
                return -1, fmt.Errorf("reading tc class output: %w", err)
        }

        // Make sure it doesn't go forever
        for nextClass := 1; nextClass < 10000; nextClass++ {
                if !classes.Has(fmt.Sprintf("1:%d", nextClass)) {
                        return nextClass, nil
                }
        }
        // This should really never happen
        return -1, fmt.Errorf("exhausted class space, please try again")
}

// hexCIDR converts a textual CIDR into "<hex address>/<hex mask>".
// Host bits are cleared by applying the network mask first, so 1.2.3.4/16
// becomes 01020000/ffff0000.
func hexCIDR(cidr string) (string, error) {
        addr, network, err := net.ParseCIDR(cidr)
        if err != nil {
                return "", err
        }
        masked := addr.Mask(network.Mask)
        return hex.EncodeToString(masked) + "/" + network.Mask.String(), nil
}

// asciiCIDR converts a "<hex address>/<hex mask>" string back into textual
// CIDR notation ("<address>/<prefix length>"); it is the inverse of hexCIDR.
// The input must contain exactly one "/" and both halves must be valid hex.
func asciiCIDR(cidr string) (string, error) {
        addrHex, maskHex, ok := strings.Cut(cidr, "/")
        // Exactly one separator is accepted: none, or a second one, is an error.
        if !ok || strings.Contains(maskHex, "/") {
                return "", fmt.Errorf("unexpected CIDR format: %s", cidr)
        }

        addrBytes, err := hex.DecodeString(addrHex)
        if err != nil {
                return "", err
        }
        maskBytes, err := hex.DecodeString(maskHex)
        if err != nil {
                return "", err
        }

        prefixLen, _ := net.IPMask(maskBytes).Size()
        return fmt.Sprintf("%s/%d", net.IP(addrBytes).String(), prefixLen), nil
}

// findCIDRClass looks through `tc filter show` output on the shaper's
// interface for filters whose match line contains the hex form of cidr.
//
// Each entry of classAndHandleList is a two-element slice holding the flow ID
// followed by the filter handle, both taken from the most recent "filter ..."
// line seen before the matching line. found is true when at least one entry
// was collected.
//
// An error is returned if the tc command fails, cidr cannot be parsed, a
// matching filter line does not have the expected form, or the output cannot
// be scanned completely.
func (t *tcShaper) findCIDRClass(cidr string) (classAndHandleList [][]string, found bool, err error) {
        data, err := t.e.Command("tc", "filter", "show", "dev", t.iface).CombinedOutput()
        if err != nil {
                return classAndHandleList, false, err
        }

        // Named hexSpec rather than hex so the encoding/hex package is not shadowed.
        hexSpec, err := hexCIDR(cidr)
        if err != nil {
                return classAndHandleList, false, err
        }
        spec := fmt.Sprintf("match %s", hexSpec)

        scanner := bufio.NewScanner(bytes.NewBuffer(data))
        filter := ""
        for scanner.Scan() {
                line := strings.TrimSpace(scanner.Text())
                if len(line) == 0 {
                        continue
                }
                // Remember the latest filter header; the match lines that follow
                // belong to it.
                if strings.HasPrefix(line, "filter") {
                        filter = line
                        continue
                }
                if strings.Contains(line, spec) {
                        // expected tc line:
                        // `filter parent 1: protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1` (old version) or
                        // `filter parent 1: protocol ip pref 1 u32 chain 0 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 not_in_hw` (new version)
                        matches := classAndHandleMatcher.FindStringSubmatch(filter)
                        if len(matches) != 3 {
                                return classAndHandleList, false, fmt.Errorf("unexpected output from tc: %s %d (%v)", filter, len(matches), matches)
                        }
                        resultTmp := []string{matches[2], matches[1]}
                        classAndHandleList = append(classAndHandleList, resultTmp)
                }
        }
        // A scan that stopped early may have missed filters, so report it
        // instead of returning a partial result as success.
        if err := scanner.Err(); err != nil {
                return nil, false, fmt.Errorf("reading tc filter output: %w", err)
        }
        return classAndHandleList, len(classAndHandleList) > 0, nil
}

// makeKBitString formats the quantity's value, integer-divided by 1000, with
// a "kbit" suffix.
func makeKBitString(rsrc *resource.Quantity) string {
        kbits := rsrc.Value() / 1000
        return fmt.Sprintf("%dkbit", kbits)
}

// makeNewClass picks the next free class number and adds an htb class
// "1:<number>" with the given rate under parent "1:" on the shaper's
// interface. It returns the class number, or -1 and the error on failure.
func (t *tcShaper) makeNewClass(rate string) (int, error) {
        classID, err := t.nextClassID()
        if err != nil {
                return -1, err
        }

        args := []string{
                "class", "add",
                "dev", t.iface,
                "parent", "1:",
                "classid", fmt.Sprintf("1:%d", classID),
                "htb", "rate", rate,
        }
        if err := t.execAndLog("tc", args...); err != nil {
                return -1, err
        }
        return classID, nil
}

// Limit installs bandwidth limits for cidr on the shaper's interface.
//
// For each non-nil quantity a new htb class is created at that rate and a u32
// filter is added that sends traffic to the class: download matches cidr as
// the destination ("dst"), upload matches it as the source ("src"). The
// download limit is applied first. The first error encountered is returned.
func (t *tcShaper) Limit(cidr string, upload, download *resource.Quantity) (err error) {
        // shape creates the class for one direction and attaches its filter;
        // a nil limit means that direction is left untouched.
        shape := func(limit *resource.Quantity, direction string) error {
                if limit == nil {
                        return nil
                }
                classID, err := t.makeNewClass(makeKBitString(limit))
                if err != nil {
                        return err
                }
                return t.execAndLog("tc", "filter", "add",
                        "dev", t.iface,
                        "protocol", "ip",
                        "parent", "1:0",
                        "prio", "1", "u32",
                        "match", "ip", direction, cidr,
                        "flowid", fmt.Sprintf("1:%d", classID))
        }

        if err := shape(download, "dst"); err != nil {
                return err
        }
        return shape(upload, "src")
}

// interfaceExists reports whether `tc qdisc show` lists a qdisc for the
// shaper's interface. When it does, the trimmed status output is returned
// alongside true. It returns false, "", <err> if the command fails, and
// false, "", nil when the output is empty or describes a "noqueue" qdisc.
func (t *tcShaper) interfaceExists() (bool, string, error) {
        raw, err := t.e.Command("tc", "qdisc", "show", "dev", t.iface).CombinedOutput()
        if err != nil {
                return false, "", err
        }

        status := strings.TrimSpace(string(raw))
        if status == "" {
                return false, "", nil
        }

        // Newer versions of tc and/or the kernel return the following instead of nothing:
        // qdisc noqueue 0: root refcnt 2
        if words := strings.Fields(status); len(words) > 1 && words[1] == "noqueue" {
                return false, "", nil
        }
        return true, status, nil
}

// ReconcileCIDR makes sure a limit exists for cidr: when no matching filter
// is found it applies the limits via Limit, and when one already exists it
// leaves it as it is.
func (t *tcShaper) ReconcileCIDR(cidr string, upload, download *resource.Quantity) error {
        _, found, err := t.findCIDRClass(cidr)
        switch {
        case err != nil:
                return err
        case found:
                // TODO: actually check bandwidth limits here
                return nil
        default:
                return t.Limit(cidr, upload, download)
        }
}

// ReconcileInterface makes sure the shaper's interface has the expected root
// htb qdisc with handle "1:".
//
// If no qdisc is present one is created. If a qdisc is present but its status
// line has fewer than 12 space-separated fields, is not "htb", or does not
// use handle "1:", the existing qdisc is deleted (using the handle from the
// status line) and a new one is created. A status line too short to contain
// a handle is reported as an error rather than indexed out of range.
func (t *tcShaper) ReconcileInterface() error {
        exists, output, err := t.interfaceExists()
        if err != nil {
                return err
        }
        if !exists {
                klog.V(4).Info("Didn't find bandwidth interface, creating")
                return t.initializeInterface()
        }
        fields := strings.Split(output, " ")
        // fields[1] and fields[2] are read below; without this guard a short
        // status line would panic with an index out of range.
        if len(fields) < 3 {
                return fmt.Errorf("unexpected output from tc qdisc show: %q", output)
        }
        if len(fields) < 12 || fields[1] != "htb" || fields[2] != "1:" {
                if err := t.deleteInterface(fields[2]); err != nil {
                        return err
                }
                return t.initializeInterface()
        }
        return nil
}

// initializeInterface adds a root htb qdisc with handle "1:" and default
// class 30 to the shaper's interface.
func (t *tcShaper) initializeInterface() error {
        args := []string{"qdisc", "add", "dev", t.iface, "root", "handle", "1:", "htb", "default", "30"}
        return t.execAndLog("tc", args...)
}

// Reset removes every filter found for cidr on the shaper's interface,
// together with the class each filter points at. For each match the filter is
// deleted first, then its class. It fails if no filter matches cidr or if any
// tc command returns an error.
func (t *tcShaper) Reset(cidr string) error {
        matches, found, err := t.findCIDRClass(cidr)
        if err != nil {
                return err
        }
        if !found {
                return fmt.Errorf("Failed to find cidr: %s on interface: %s", cidr, t.iface)
        }

        for _, pair := range matches {
                // Each pair holds the class ID at index 0 and the filter handle at index 1.
                classID, handle := pair[0], pair[1]

                if err := t.execAndLog("tc", "filter", "del",
                        "dev", t.iface,
                        "parent", "1:",
                        "proto", "ip",
                        "prio", "1",
                        "handle", handle, "u32"); err != nil {
                        return err
                }
                if err := t.execAndLog("tc", "class", "del",
                        "dev", t.iface,
                        "parent", "1:",
                        "classid", classID); err != nil {
                        return err
                }
        }
        return nil
}

// deleteInterface deletes the root qdisc with the given handle from the
// shaper's interface.
func (t *tcShaper) deleteInterface(class string) error {
        args := []string{"qdisc", "delete", "dev", t.iface, "root", "handle", class}
        return t.execAndLog("tc", args...)
}

// GetCIDRs returns, in textual CIDR form, the address of every "match" line
// reported by `tc filter show` for the shaper's interface.
//
// The result is a non-nil, possibly empty slice on success. An error is
// returned if the tc command fails, a match line does not have exactly four
// space-separated parts, its hex CIDR cannot be decoded, or the output cannot
// be scanned completely.
func (t *tcShaper) GetCIDRs() ([]string, error) {
        data, err := t.e.Command("tc", "filter", "show", "dev", t.iface).CombinedOutput()
        if err != nil {
                return nil, err
        }

        result := []string{}
        scanner := bufio.NewScanner(bytes.NewBuffer(data))
        for scanner.Scan() {
                line := strings.TrimSpace(scanner.Text())
                if len(line) == 0 {
                        continue
                }
                if strings.Contains(line, "match") {
                        parts := strings.Split(line, " ")
                        // expected tc line:
                        // match <cidr> at <number>
                        if len(parts) != 4 {
                                return nil, fmt.Errorf("unexpected output: %v", parts)
                        }
                        cidr, err := asciiCIDR(parts[1])
                        if err != nil {
                                return nil, err
                        }
                        result = append(result, cidr)
                }
        }
        // A scan that stopped early would silently drop CIDRs, so report it
        // instead of returning a partial list as success.
        if err := scanner.Err(); err != nil {
                return nil, fmt.Errorf("reading tc filter output: %w", err)
        }
        return result, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package bandwidth

import (
        "fmt"

        "k8s.io/apimachinery/pkg/api/resource"
)

// minRsrc is the smallest bandwidth quantity accepted by validateBandwidthIsReasonable.
var minRsrc = resource.MustParse("1k")
// maxRsrc is the largest bandwidth quantity accepted by validateBandwidthIsReasonable.
var maxRsrc = resource.MustParse("1P")

// validateBandwidthIsReasonable returns an error when the quantity's value is
// below minRsrc or above maxRsrc, and nil otherwise.
func validateBandwidthIsReasonable(rsrc *resource.Quantity) error {
        switch value := rsrc.Value(); {
        case value < minRsrc.Value():
                return fmt.Errorf("resource is unreasonably small (< 1kbit)")
        case value > maxRsrc.Value():
                return fmt.Errorf("resource is unreasonably large (> 1Pbit)")
        }
        return nil
}

// ExtractPodBandwidthResources reads the "kubernetes.io/ingress-bandwidth" and
// "kubernetes.io/egress-bandwidth" pod annotations and returns them as
// quantities. An annotation that is absent yields a nil quantity. If either
// present annotation fails to parse or falls outside the accepted range, both
// results are nil and the error is returned. Ingress is processed before egress.
func ExtractPodBandwidthResources(podAnnotations map[string]string) (ingress, egress *resource.Quantity, err error) {
        // lookup parses and validates the quantity stored under key; a missing
        // key (including any lookup on a nil map) yields nil, nil.
        lookup := func(key string) (*resource.Quantity, error) {
                raw, ok := podAnnotations[key]
                if !ok {
                        return nil, nil
                }
                qty, err := resource.ParseQuantity(raw)
                if err != nil {
                        return nil, err
                }
                if err := validateBandwidthIsReasonable(&qty); err != nil {
                        return nil, err
                }
                return &qty, nil
        }

        if ingress, err = lookup("kubernetes.io/ingress-bandwidth"); err != nil {
                return nil, nil, err
        }
        if egress, err = lookup("kubernetes.io/egress-bandwidth"); err != nil {
                return nil, nil, err
        }
        return ingress, egress, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package config

import (
        "context"
        "errors"
        "fmt"
        "net/url"
        "time"

        "github.com/containerd/log"
        "github.com/pelletier/go-toml/v2"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
        "k8s.io/kubelet/pkg/cri/streaming"

        runhcsoptions "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options"
        runcoptions "github.com/containerd/containerd/api/types/runc/options"
        runtimeoptions "github.com/containerd/containerd/api/types/runtimeoptions/v1"
        "github.com/containerd/containerd/v2/internal/cri/annotations"
        "github.com/containerd/containerd/v2/pkg/deprecation"
        "github.com/containerd/containerd/v2/plugins"
)

const (
        // defaultImagePullProgressTimeoutDuration is the default value of imagePullProgressTimeout.
        //
        // NOTE:
        //
        // This ImagePullProgressTimeout feature is ported from kubelet/dockershim's
        // --image-pull-progress-deadline. The original value is 1m0. Unlike the docker
        // daemon, containerd doesn't have a global concurrent download limitation
        // before migrating to Transfer Service. If kubelet runs with concurrent
        // image pull, the node will run under IO pressure. The ImagePull process
        // could be impacted by itself if the target image is a large one with a
        // lot of layers. Also, both the containers' writable layers and the image
        // storage share one disk. The ImagePull process commits blobs to the content
        // store with fsync, which might bring unrelated files' dirty pages into
        // disk in one transaction [1]. The 1m0 value isn't good enough. Based
        // on the #9347 case and the kubernetes community's usage [2], the default
        // value is updated to 5m0. If end users still run into unexpected cancels,
        // they need to configure it based on their environment.
        //
        // [1]: Fast commits for ext4 - https://lwn.net/Articles/842385/
        // [2]: https://github.com/kubernetes/kubernetes/blob/1635c380b26a1d8cc25d36e9feace9797f4bae3c/cluster/gce/util.sh#L882
        defaultImagePullProgressTimeoutDuration = 5 * time.Minute
)

// SandboxControllerMode names a sandbox controller implementation.
// The values defined in this file are ModePodSandbox and ModeShim.
type SandboxControllerMode string

const (
        // ModePodSandbox means use the Controller implementation from the sbserver podsandbox package.
        // It is the default mode: ValidateRuntimeConfig assigns it to any runtime whose
        // Sandboxer is empty.
        ModePodSandbox SandboxControllerMode = "podsandbox"
        // ModeShim means use whatever Controller implementation is provided by the shim.
        ModeShim SandboxControllerMode = "shim"
        // DefaultSandboxImage is the default image to use for sandboxes when empty or
        // for default configurations.
        DefaultSandboxImage = "registry.k8s.io/pause:3.9"
        // IOTypeFifo is the container IO type implemented by creating named pipes.
        // An empty Runtime.IOType is treated as IOTypeFifo during validation.
        IOTypeFifo = "fifo"
        // IOTypeStreaming is the container IO type implemented by connecting the
        // streaming API to the sandbox endpoint.
        IOTypeStreaming = "streaming"
)

// Runtime holds the configuration of a single runtime entry, as stored in the
// ContainerdConfig.Runtimes table keyed by CRI runtime handler name.
type Runtime struct {
        // Type is the runtime type to use in containerd e.g. io.containerd.runtime.v1.linux
        Type string `toml:"runtime_type" json:"runtimeType"`
        // Path is an optional field that can be used to overwrite the path to a shim runtime binary.
        // When specified, containerd will ignore the runtime name field when resolving the shim location.
        // Path must be absolute.
        Path string `toml:"runtime_path" json:"runtimePath"`
        // PodAnnotations is a list of pod annotations passed to both the pod sandbox as well as
        // container OCI annotations.
        PodAnnotations []string `toml:"pod_annotations" json:"PodAnnotations"`
        // ContainerAnnotations is a list of container annotations passed through to the OCI config of the containers.
        // Container annotations in CRI are usually generated by other Kubernetes node components (i.e., not users).
        // Currently, only device plugins populate the annotations.
        ContainerAnnotations []string `toml:"container_annotations" json:"ContainerAnnotations"`
        // Options are config options for the runtime.
        Options map[string]interface{} `toml:"options" json:"options"`
        // PrivilegedWithoutHostDevices overrides the default behaviour of adding host devices to the
        // runtime spec when the container is privileged. Defaults to false.
        PrivilegedWithoutHostDevices bool `toml:"privileged_without_host_devices" json:"privileged_without_host_devices"`
        // PrivilegedWithoutHostDevicesAllDevicesAllowed overrides the default device allowlisting
        // behaviour in the runtime spec when PrivilegedWithoutHostDevices is enabled. Requires
        // PrivilegedWithoutHostDevices to be enabled (ValidateRuntimeConfig rejects the
        // configuration otherwise). Defaults to false.
        PrivilegedWithoutHostDevicesAllDevicesAllowed bool `toml:"privileged_without_host_devices_all_devices_allowed" json:"privileged_without_host_devices_all_devices_allowed"`
        // BaseRuntimeSpec is a json file with an OCI spec to use as the base spec that all containers will be created from.
        BaseRuntimeSpec string `toml:"base_runtime_spec" json:"baseRuntimeSpec"`
        // NetworkPluginConfDir is a directory containing the CNI network information for the runtime class.
        NetworkPluginConfDir string `toml:"cni_conf_dir" json:"cniConfDir"`
        // NetworkPluginMaxConfNum is the max number of plugin config files that will
        // be loaded from the cni config directory by go-cni. Set the value to 0 to
        // load all config files (no arbitrary limit). The legacy default value is 1.
        NetworkPluginMaxConfNum int `toml:"cni_max_conf_num" json:"cniMaxConfNum"`
        // Snapshotter sets the snapshotter at the runtime level instead of as a global configuration.
        // An example use case is to use devmapper or other snapshotters in Kata containers for performance and security
        // while using default snapshotters for operational simplicity.
        // See https://github.com/containerd/containerd/issues/6657 for details.
        Snapshotter string `toml:"snapshotter" json:"snapshotter"`
        // Sandboxer defines which sandbox runtime to use when scheduling pods.
        // This feature requires the new CRI server implementation (enabled by default in 2.0).
        // shim - means use whatever Controller implementation is provided by the shim (e.g. use RemoteController).
        // podsandbox - means use the Controller implementation from the sbserver podsandbox package.
        // When empty, ValidateRuntimeConfig sets it to podsandbox.
        Sandboxer string `toml:"sandboxer" json:"sandboxer"`
        // IOType defines how containerd transfers the io streams of the container.
        // If it is not set, a named pipe will be created for the container ("fifo").
        // It can also be set to "streaming" to create a stream through the streaming api
        // and use it as the channel to transfer the io stream.
        // ValidateRuntimeConfig rejects any other value.
        IOType string `toml:"io_type" json:"io_type"`
}

// ContainerdConfig contains toml config related to containerd
type ContainerdConfig struct {
        // DefaultRuntimeName is the default runtime name to use from the runtimes table.
        // ValidateRuntimeConfig requires it to be non-empty and to name an entry in Runtimes.
        DefaultRuntimeName string `toml:"default_runtime_name" json:"defaultRuntimeName"`

        // Runtimes is a map from CRI RuntimeHandler strings, which specify types of runtime
        // configurations, to the matching configurations.
        Runtimes map[string]Runtime `toml:"runtimes" json:"runtimes"`

        // IgnoreBlockIONotEnabledErrors is a boolean flag to ignore
        // blockio related errors when blockio support has not been
        // enabled.
        IgnoreBlockIONotEnabledErrors bool `toml:"ignore_blockio_not_enabled_errors" json:"ignoreBlockIONotEnabledErrors"`

        // IgnoreRdtNotEnabledErrors is a boolean flag to ignore RDT related errors
        // when RDT support has not been enabled.
        IgnoreRdtNotEnabledErrors bool `toml:"ignore_rdt_not_enabled_errors" json:"ignoreRdtNotEnabledErrors"`
}

// CniConfig contains toml config related to cni
type CniConfig struct {
        // NetworkPluginBinDir is the directory in which the binaries for the plugin are kept.
        NetworkPluginBinDir string `toml:"bin_dir" json:"binDir"`
        // NetworkPluginConfDir is the directory in which the admin places a CNI conf.
        NetworkPluginConfDir string `toml:"conf_dir" json:"confDir"`
        // NetworkPluginMaxConfNum is the max number of plugin config files that will
        // be loaded from the cni config directory by go-cni. Set the value to 0 to
        // load all config files (no arbitrary limit). The legacy default value is 1.
        NetworkPluginMaxConfNum int `toml:"max_conf_num" json:"maxConfNum"`
        // NetworkPluginSetupSerially is a boolean flag to specify whether containerd sets up networks serially
        // if there are multiple CNI plugin config files and NetworkPluginMaxConfNum is larger than 1.
        //
        // NOTE: On the Linux platform, containerd provides a loopback network
        // configuration by default, so there are at least two network plugins.
        // The default value of NetworkPluginSetupSerially is false, which means
        // the loopback and eth0 are handled in parallel. Since the loopback
        // device is created when the net namespace is created, it's safe to run
        // in parallel mode as the default setting.
        NetworkPluginSetupSerially bool `toml:"setup_serially" json:"setupSerially"`
        // NetworkPluginConfTemplate is the file path of the golang template used to generate the cni config.
        // When it is set, containerd will get cidr(s) from kubelet to replace {{.PodCIDR}},
        // {{.PodCIDRRanges}} or {{.Routes}} in the template, and write the config into
        // NetworkPluginConfDir.
        // Ideally the cni config should be placed by the system admin or a cni daemon like calico,
        // weaveworks etc. However, this is useful for cases where there is no cni daemonset to place the cni config.
        // It allows very simple generic networking using the Kubernetes built-in node pod CIDR IPAM, avoiding the
        // need to fetch the node object through some external process (which has scalability, auth, complexity issues).
        // It is currently heavily used in kubernetes-containerd CI testing.
        // NetworkPluginConfTemplate was once deprecated in containerd v1.7.0,
        // but its deprecation was cancelled in v1.7.3.
        NetworkPluginConfTemplate string `toml:"conf_template" json:"confTemplate"`
        // IPPreference specifies the strategy to use when selecting the main IP address for a pod.
        //
        // Options include:
        // * ipv4, "" - (default) select the first ipv4 address
        // * ipv6 - select the first ipv6 address
        // * cni - use the order returned by the CNI plugins, returning the first IP address from the results
        IPPreference string `toml:"ip_pref" json:"ipPref"`
}

// Mirror contains the config related to the registry mirror
type Mirror struct {
        // Endpoints are the endpoints for a namespace. The CRI plugin will try the
        // endpoints one by one until a working one is found. Each endpoint must be
        // a valid url with a host specified.
        // The scheme, host and path from the endpoint URL will be used.
        Endpoints []string `toml:"endpoint" json:"endpoint"`
}

// AuthConfig contains the config related to authentication to a specific registry
type AuthConfig struct {
        // Username is the username used to log in to the registry.
        Username string `toml:"username" json:"username"`
        // Password is the password used to log in to the registry.
        Password string `toml:"password" json:"password"`
        // Auth is a base64 encoded string from the concatenation of the username,
        // a colon, and the password.
        Auth string `toml:"auth" json:"auth"`
        // IdentityToken is used to authenticate the user and get
        // an access token for the registry.
        IdentityToken string `toml:"identitytoken" json:"identitytoken"`
}

// Registry holds the configured registry settings.
type Registry struct {
        // ConfigPath is a path to the root directory containing registry-specific
        // configurations.
        // If ConfigPath is set, the rest of the registry specific options are ignored.
        ConfigPath string `toml:"config_path" json:"configPath"`
        // Mirrors are namespace to mirror mapping for all namespaces.
        // This option will not be used when ConfigPath is provided; ValidateImageConfig
        // returns an error when both are set and records a deprecation warning
        // whenever Mirrors is non-empty.
        // DEPRECATED: Use ConfigPath instead. Remove in containerd 2.0.
        Mirrors map[string]Mirror `toml:"mirrors" json:"mirrors"`
        // Configs are configs for each registry.
        // The key is the domain name or IP of the registry.
        // ValidateImageConfig records a deprecation warning whenever Configs is non-empty.
        // DEPRECATED: Use ConfigPath instead.
        Configs map[string]RegistryConfig `toml:"configs" json:"configs"`
        // Auths are registry endpoint to auth config mapping. The registry endpoint must
        // be a valid url with host specified.
        // ValidateImageConfig copies every entry into Configs (keyed by the URL host
        // when the endpoint has a scheme) and records a deprecation warning.
        // DEPRECATED: Use ConfigPath instead. Remove in containerd 2.0, supported in 1.x releases.
        Auths map[string]AuthConfig `toml:"auths" json:"auths"`
        // Headers adds additional HTTP headers that get sent to all registries
        Headers map[string][]string `toml:"headers" json:"headers"`
}

// RegistryConfig contains configuration used to communicate with the registry.
type RegistryConfig struct {
        // Auth contains information to authenticate to the registry.
        // It is a pointer; a nil value means no auth information is configured here.
        Auth *AuthConfig `toml:"auth" json:"auth"`
}

// ImageDecryption contains configuration for handling decryption of encrypted container images.
type ImageDecryption struct {
        // KeyModel specifies the trust model of where keys should reside.
        //
        // Details of field usage can be found in:
        // https://github.com/containerd/containerd/tree/main/docs/cri/config.md
        //
        // Details of key models can be found in:
        // https://github.com/containerd/containerd/tree/main/docs/cri/decryption.md
        KeyModel string `toml:"key_model" json:"keyModel"`
}

// ImagePlatform represents the platform to use for an image, including the
// snapshotter to use. If the snapshotter is not provided, the platform default
// can be assumed. When the platform is not provided, the default platform can
// be assumed.
type ImagePlatform struct {
        // Platform is the platform to use for the image.
        Platform string `toml:"platform" json:"platform"`
        // Snapshotter sets the snapshotter at the runtime level instead of as a global configuration.
        // An example use case is to use devmapper or other snapshotters in Kata containers for performance and security
        // while using default snapshotters for operational simplicity.
        // See https://github.com/containerd/containerd/issues/6657 for details.
        Snapshotter string `toml:"snapshotter" json:"snapshotter"`
}

// ImageConfig contains toml config related to image handling: snapshotter
// selection, pinned images, registry access, decryption and pull behaviour.
type ImageConfig struct {
        // Snapshotter is the snapshotter used by containerd.
        Snapshotter string `toml:"snapshotter" json:"snapshotter"`

        // DisableSnapshotAnnotations disables passing additional annotations (image
        // related information) to snapshotters. These annotations are required by
        // stargz snapshotter (https://github.com/containerd/stargz-snapshotter).
        DisableSnapshotAnnotations bool `toml:"disable_snapshot_annotations" json:"disableSnapshotAnnotations"`

        // DiscardUnpackedLayers is a boolean flag to specify whether to allow GC to
        // remove layers from the content store after successfully unpacking these
        // layers to the snapshotter.
        DiscardUnpackedLayers bool `toml:"discard_unpacked_layers" json:"discardUnpackedLayers"`

        // PinnedImages are images which the CRI plugin uses and should not be
        // removed by the CRI client. The images have a key which can be used
        // by other plugins to look up the current image name.
        // Image names should be full names including domain and tag.
        // Examples:
        //   "sandbox": "registry.k8s.io/pause:3.9"
        //   "base": "docker.io/library/ubuntu:latest"
        // Migrated from:
        // (PluginConfig).SandboxImage string `toml:"sandbox_image" json:"sandboxImage"`
        PinnedImages map[string]string `toml:"pinned_images" json:"pinned_images"`

        // RuntimePlatforms is a map between the runtime and the image platform to
        // use for that runtime. When resolving an image for a runtime, this
        // mapping will be used to select the image for the platform and the
        // snapshotter for unpacking.
        RuntimePlatforms map[string]ImagePlatform `toml:"runtime_platforms" json:"runtimePlatforms"`

        // Registry contains config related to the registry
        Registry Registry `toml:"registry" json:"registry"`

        // ImageDecryption contains config related to handling decryption of encrypted container images
        ImageDecryption `toml:"image_decryption" json:"imageDecryption"`

        // MaxConcurrentDownloads restricts the number of concurrent downloads for each image.
        // TODO: Migrate to transfer service
        MaxConcurrentDownloads int `toml:"max_concurrent_downloads" json:"maxConcurrentDownloads"`

        // ImagePullProgressTimeout is the maximum duration for which no image
        // data is read from the image registry on the open connection. It is
        // reset whenever a new byte has been read. On timeout, the image
        // pull will be cancelled. A zero value means there is no timeout.
        //
        // The string is in the golang duration format, see:
        //   https://golang.org/pkg/time/#ParseDuration
        //
        // ValidateImageConfig rejects a non-empty value that does not parse as a duration.
        ImagePullProgressTimeout string `toml:"image_pull_progress_timeout" json:"imagePullProgressTimeout"`

        // ImagePullWithSyncFs is an experimental setting. It forces a filesystem
        // sync during unpacking to ensure data integrity.
        // TODO: Migrate to transfer service
        ImagePullWithSyncFs bool `toml:"image_pull_with_sync_fs" json:"imagePullWithSyncFs"`

        // StatsCollectPeriod is the period (in seconds) of snapshots stats collection.
        StatsCollectPeriod int `toml:"stats_collect_period" json:"statsCollectPeriod"`
}

// RuntimeConfig contains toml config related to the CRI plugin;
// it is a subset of Config.
type RuntimeConfig struct {
        // ContainerdConfig contains config related to containerd
        ContainerdConfig `toml:"containerd" json:"containerd"`
        // CniConfig contains config related to cni
        CniConfig `toml:"cni" json:"cni"`
        // EnableSelinux indicates to enable the selinux support.
        EnableSelinux bool `toml:"enable_selinux" json:"enableSelinux"`
        // SelinuxCategoryRange allows the upper bound on the category range to be set.
        // If not specified or set to 0, defaults to 1024 from the selinux package.
        SelinuxCategoryRange int `toml:"selinux_category_range" json:"selinuxCategoryRange"`
        // MaxContainerLogLineSize is the maximum log line size in bytes for a container.
        // Log lines longer than the limit will be split into multiple lines. A non-positive
        // value means no limit.
        MaxContainerLogLineSize int `toml:"max_container_log_line_size" json:"maxContainerLogSize"`
        // DisableCgroup indicates to disable the cgroup support.
        // This is useful when containerd does not have permission to access cgroup.
        DisableCgroup bool `toml:"disable_cgroup" json:"disableCgroup"`
        // DisableApparmor indicates to disable the apparmor support.
        // This is useful when containerd does not have permission to access Apparmor.
        DisableApparmor bool `toml:"disable_apparmor" json:"disableApparmor"`
        // RestrictOOMScoreAdj indicates to limit the lower bound of OOMScoreAdj to containerd's
        // current OOMScoreAdj.
        // This is useful when containerd does not have permission to decrease OOMScoreAdj.
        RestrictOOMScoreAdj bool `toml:"restrict_oom_score_adj" json:"restrictOOMScoreAdj"`
        // DisableProcMount disables Kubernetes ProcMount support. This MUST be set to `true`
        // when using containerd with Kubernetes <=1.11.
        DisableProcMount bool `toml:"disable_proc_mount" json:"disableProcMount"`
        // UnsetSeccompProfile is the profile containerd/cri will use if the provided seccomp profile is
        // unset (`""`) for a container (default is `unconfined`)
        UnsetSeccompProfile string `toml:"unset_seccomp_profile" json:"unsetSeccompProfile"`
        // TolerateMissingHugetlbController if set to false will error out on create/update
        // container requests with huge page limits if the cgroup controller for hugepages is not present.
        // This helps with supporting Kubernetes <=1.18 out of the box. (default is `true`)
        TolerateMissingHugetlbController bool `toml:"tolerate_missing_hugetlb_controller" json:"tolerateMissingHugetlbController"`
        // DisableHugetlbController indicates to silently disable the hugetlb controller, even when it is
        // present in /sys/fs/cgroup/cgroup.controllers.
        // This helps with running rootless mode + cgroup v2 + systemd but without hugetlb delegation.
        DisableHugetlbController bool `toml:"disable_hugetlb_controller" json:"disableHugetlbController"`
        // DeviceOwnershipFromSecurityContext changes the default behavior of setting container devices uid/gid
        // from CRI's SecurityContext (RunAsUser/RunAsGroup) instead of taking host's uid/gid. Defaults to false.
        DeviceOwnershipFromSecurityContext bool `toml:"device_ownership_from_security_context" json:"device_ownership_from_security_context"`
        // IgnoreImageDefinedVolumes ignores volumes defined by the image. Useful for better resource
        // isolation, security and early detection of issues in the mount configuration when using
        // ReadOnlyRootFilesystem since containers won't silently mount a temporary volume.
        IgnoreImageDefinedVolumes bool `toml:"ignore_image_defined_volumes" json:"ignoreImageDefinedVolumes"`
        // NetNSMountsUnderStateDir places all mounts for network namespaces under StateDir/netns instead
        // of being placed under the hardcoded directory /var/run/netns. Changing this setting requires
        // that all containers are deleted.
        NetNSMountsUnderStateDir bool `toml:"netns_mounts_under_state_dir" json:"netnsMountsUnderStateDir"`
        // EnableUnprivilegedPorts configures net.ipv4.ip_unprivileged_port_start=0
        // for all containers which are not using host network
        // and if it is not overwritten by PodSandboxConfig.
        // Note that the default is currently disabled, but the intent is to change it in the future, see:
        //   https://github.com/kubernetes/kubernetes/issues/102612
        EnableUnprivilegedPorts bool `toml:"enable_unprivileged_ports" json:"enableUnprivilegedPorts"`
        // EnableUnprivilegedICMP configures net.ipv4.ping_group_range="0 2147483647"
        // for all containers which are not using host network, are not running in user namespace
        // and if it is not overwritten by PodSandboxConfig.
        // Note that the default is currently disabled, but the intent is to change it in the future
        // together with EnableUnprivilegedPorts.
        EnableUnprivilegedICMP bool `toml:"enable_unprivileged_icmp" json:"enableUnprivilegedICMP"`
        // EnableCDI indicates to enable injection of the Container Device Interface Specifications
        // into the OCI config.
        // For more details about CDI and the syntax of CDI Spec files please refer to
        // https://tags.cncf.io/container-device-interface.
        EnableCDI bool `toml:"enable_cdi" json:"enableCDI"`
        // CDISpecDirs is the list of directories to scan for Container Device Interface Specifications.
        // For more details about CDI configuration please refer to
        // https://tags.cncf.io/container-device-interface#containerd-configuration
        CDISpecDirs []string `toml:"cdi_spec_dirs" json:"cdiSpecDirs"`

        // DrainExecSyncIOTimeout is the maximum duration to wait for the ExecSync
        // API's IO EOF event after the exec init process exits. A zero value means
        // there is no timeout.
        //
        // The string is in the golang duration format, see:
        //   https://golang.org/pkg/time/#ParseDuration
        //
        // For example, the value can be '5h', '2h30m', '10s'.
        DrainExecSyncIOTimeout string `toml:"drain_exec_sync_io_timeout" json:"drainExecSyncIOTimeout"`

        // IgnoreDeprecationWarnings is the list of the deprecation IDs (such as "io.containerd.deprecation/pull-schema-1-image")
        // that should be ignored when checking the "ContainerdHasNoDeprecationWarnings" condition.
        IgnoreDeprecationWarnings []string `toml:"ignore_deprecation_warnings" json:"ignoreDeprecationWarnings"`
}

// X509KeyPairStreaming contains the x509 key pair configuration for streaming.
type X509KeyPairStreaming struct {
        // TLSCertFile is the path to a certificate file
        TLSCertFile string `toml:"tls_cert_file" json:"tlsCertFile"`
        // TLSKeyFile is the path to a private key file
        TLSKeyFile string `toml:"tls_key_file" json:"tlsKeyFile"`
}

// Config contains all configurations for the CRI runtime plugin: the embedded
// RuntimeConfig plus the directory and endpoint paths below. The path fields
// carry only json tags (no toml tags).
type Config struct {
        // RuntimeConfig is the config for CRI runtime.
        RuntimeConfig
        // ContainerdRootDir is the root directory path for containerd.
        ContainerdRootDir string `json:"containerdRootDir"`
        // ContainerdEndpoint is the containerd endpoint path.
        ContainerdEndpoint string `json:"containerdEndpoint"`
        // RootDir is the root directory path for managing cri plugin files
        // (metadata checkpoint etc.)
        RootDir string `json:"rootDir"`
        // StateDir is the root directory path for managing volatile pod/container data
        StateDir string `json:"stateDir"`
}

// ServerConfig contains all the configuration for the CRI API server.
type ServerConfig struct {
        // DisableTCPService disables serving CRI on the TCP server.
        DisableTCPService bool `toml:"disable_tcp_service" json:"disableTCPService"`
        // StreamServerAddress is the ip address the streaming server is listening on.
        StreamServerAddress string `toml:"stream_server_address" json:"streamServerAddress"`
        // StreamServerPort is the port the streaming server is listening on.
        StreamServerPort string `toml:"stream_server_port" json:"streamServerPort"`
        // StreamIdleTimeout is the maximum time a streaming connection
        // can be idle before the connection is automatically closed.
        // The string is in the golang duration format, see:
        //   https://golang.org/pkg/time/#ParseDuration
        // ValidateServerConfig rejects a non-empty value that does not parse as a duration.
        StreamIdleTimeout string `toml:"stream_idle_timeout" json:"streamIdleTimeout"`
        // EnableTLSStreaming indicates to enable the TLS streaming support.
        EnableTLSStreaming bool `toml:"enable_tls_streaming" json:"enableTLSStreaming"`
        // X509KeyPairStreaming is an x509 key pair used for TLS streaming
        X509KeyPairStreaming `toml:"x509_key_pair_streaming" json:"x509KeyPairStreaming"`
}

const (
        // RuntimeUntrusted is the runtime handler name that GetSandboxRuntime selects
        // for sandboxes carrying the untrusted-workload annotation.
        RuntimeUntrusted = "untrusted"
        // RuntimeDefault is the implicit name of the default runtime.
        // NOTE(review): the original comment referred to ContainerdConfig.DefaultRuntime,
        // which is not a field of ContainerdConfig in this file — confirm where this
        // constant is still used.
        RuntimeDefault = "default"
        // KeyModelNode is the key model where keys for encrypted images reside
        // on the worker nodes
        KeyModelNode = "node"
)

// ValidateImageConfig checks the registry and pull-timeout settings of the
// given image configuration.
//
// It returns the deprecation warnings collected so far together with the
// first validation error encountered (nil when the configuration is valid).
// As a side effect, every entry of the deprecated `auths` table is copied
// into c.Registry.Configs.
func ValidateImageConfig(ctx context.Context, c *ImageConfig) ([]deprecation.Warning, error) {
        var warnings []deprecation.Warning
        reg := &c.Registry

        // `mirrors` is deprecated and mutually exclusive with `config_path`.
        if len(reg.Mirrors) > 0 {
                if reg.ConfigPath != "" {
                        return warnings, errors.New("`mirrors` cannot be set when `config_path` is provided")
                }
                warnings = append(warnings, deprecation.CRIRegistryMirrors)
                log.G(ctx).Warning("`mirrors` is deprecated, please use `config_path` instead")
        }

        // `configs` is deprecated but still accepted.
        if len(reg.Configs) != 0 {
                warnings = append(warnings, deprecation.CRIRegistryConfigs)
                log.G(ctx).Warning("`configs` is deprecated, please use `config_path` instead")
        }

        // `auths` is deprecated; fold each entry into `configs`.
        if len(reg.Auths) != 0 {
                if reg.Configs == nil {
                        reg.Configs = make(map[string]RegistryConfig)
                }
                for endpoint, auth := range reg.Auths {
                        parsed, err := url.Parse(endpoint)
                        if err != nil {
                                return warnings, fmt.Errorf("failed to parse registry url %q from `registry.auths`: %w", endpoint, err)
                        }
                        // The registry config is keyed by host only, so drop the
                        // scheme when the endpoint carries one.
                        key := endpoint
                        if parsed.Scheme != "" {
                                key = parsed.Host
                        }
                        // Take a per-iteration copy so the stored pointer does not
                        // alias the loop variable.
                        authCopy := auth
                        entry := reg.Configs[key]
                        entry.Auth = &authCopy
                        reg.Configs[key] = entry
                }
                warnings = append(warnings, deprecation.CRIRegistryAuths)
                log.G(ctx).Warning("`auths` is deprecated, please use `ImagePullSecrets` instead")
        }

        // image_pull_progress_timeout must be a parseable duration when set.
        if timeout := c.ImagePullProgressTimeout; timeout != "" {
                if _, err := time.ParseDuration(timeout); err != nil {
                        return warnings, fmt.Errorf("invalid image pull progress timeout: %w", err)
                }
        }

        return warnings, nil
}

// ValidateRuntimeConfig validates the given runtime configuration and fills in
// the defaults it is responsible for.
//
// It allocates c.ContainerdConfig.Runtimes when nil, requires
// `default_runtime_name` to be non-empty and to name an entry in the runtimes
// table, and then checks every runtime entry:
//   - `privileged_without_host_devices_all_devices_allowed` requires
//     `privileged_without_host_devices`;
//   - an empty `sandboxer` is replaced by ModePodSandbox and stored back;
//   - `io_type` must be empty (treated as IOTypeFifo), IOTypeFifo or
//     IOTypeStreaming.
//
// Finally it checks that `drain_exec_sync_io_timeout` parses as a Go duration
// and delegates to ValidateEnableUnprivileged.
//
// The returned warnings slice is never appended to by this function; a
// non-nil error describes the first violation found. Runtimes are visited in
// map iteration order, so with several invalid entries the reported one may
// vary between calls.
func ValidateRuntimeConfig(ctx context.Context, c *RuntimeConfig) ([]deprecation.Warning, error) {
        var warnings []deprecation.Warning
        if c.ContainerdConfig.Runtimes == nil {
                c.ContainerdConfig.Runtimes = make(map[string]Runtime)
        }

        // Validation for default_runtime_name
        if c.ContainerdConfig.DefaultRuntimeName == "" {
                return warnings, errors.New("`default_runtime_name` is empty")
        }
        if _, ok := c.ContainerdConfig.Runtimes[c.ContainerdConfig.DefaultRuntimeName]; !ok {
                return warnings, fmt.Errorf("no corresponding runtime configured in `containerd.runtimes` for `containerd` `default_runtime_name = \"%s\"", c.ContainerdConfig.DefaultRuntimeName)
        }

        for k, r := range c.ContainerdConfig.Runtimes {
                if !r.PrivilegedWithoutHostDevices && r.PrivilegedWithoutHostDevicesAllDevicesAllowed {
                        return warnings, errors.New("`privileged_without_host_devices_all_devices_allowed` requires `privileged_without_host_devices` to be enabled")
                }
                // If empty, use default podSandbox mode. Runtime is stored by value
                // in the map, so the updated copy has to be written back.
                if len(r.Sandboxer) == 0 {
                        r.Sandboxer = string(ModePodSandbox)
                        c.ContainerdConfig.Runtimes[k] = r
                }

                // An unset io_type means the default (IOTypeFifo); it is accepted
                // as-is and intentionally not written back to the config.
                switch r.IOType {
                case "", IOTypeFifo, IOTypeStreaming:
                default:
                        // The accepted spelling for named-pipe IO is IOTypeFifo ("fifo").
                        return warnings, errors.New("`io_type` can only be `streaming` or `fifo`")
                }
        }

        // Validation for drain_exec_sync_io_timeout
        if c.DrainExecSyncIOTimeout != "" {
                if _, err := time.ParseDuration(c.DrainExecSyncIOTimeout); err != nil {
                        return warnings, fmt.Errorf("invalid `drain_exec_sync_io_timeout`: %w", err)
                }
        }
        if err := ValidateEnableUnprivileged(ctx, c); err != nil {
                return warnings, err
        }
        return warnings, nil
}

// ValidateServerConfig checks the CRI API server configuration.
//
// The only rule enforced is that a non-empty `stream_idle_timeout` must parse
// as a Go duration. No deprecation warnings are produced, so the returned
// slice is always nil.
func ValidateServerConfig(ctx context.Context, c *ServerConfig) ([]deprecation.Warning, error) {
        var warnings []deprecation.Warning

        idle := c.StreamIdleTimeout
        if idle == "" {
                // Nothing configured, nothing to validate.
                return warnings, nil
        }

        _, err := time.ParseDuration(idle)
        if err != nil {
                return warnings, fmt.Errorf("invalid stream idle timeout: %w", err)
        }
        return warnings, nil
}

// GetSandboxRuntime resolves the Runtime to use for a pod sandbox.
//
// A sandbox annotated as an untrusted workload is always mapped to the
// RuntimeUntrusted handler; the request is rejected if it names any other
// handler or asks for host namespaces. An empty handler falls back to
// DefaultRuntimeName. An error is returned when no runtime is configured under
// the resolved handler name.
func (config *Config) GetSandboxRuntime(podSandboxConfig *runtime.PodSandboxConfig, runtimeHandler string) (Runtime, error) {
        handler := runtimeHandler

        if untrustedWorkload(podSandboxConfig) {
                // With the untrusted annotation, the handler must be empty or the
                // untrusted handler itself.
                if !(handler == "" || handler == RuntimeUntrusted) {
                        return Runtime{}, errors.New("untrusted workload with explicit runtime handler is not allowed")
                }

                // An untrusted workload requesting access to the host/node is refused.
                //
                // Note: an untrusted workload that requests privileged may still be
                // granted, as the runtime may support this. For example, in a
                // virtual-machine isolated runtime, privileged is a supported option,
                // granting the workload access to the entire guest VM instead of host.
                // TODO(windows): Deprecate this so that we don't need to handle it for windows.
                if hostAccessingSandbox(podSandboxConfig) {
                        return Runtime{}, errors.New("untrusted workload with host access is not allowed")
                }

                handler = RuntimeUntrusted
        }

        if handler == "" {
                handler = config.DefaultRuntimeName
        }

        if selected, found := config.Runtimes[handler]; found {
                return selected, nil
        }
        return Runtime{}, fmt.Errorf("no runtime for %q is configured", handler)
}

// untrustedWorkload reports whether the sandbox is marked as an untrusted
// workload, i.e. its annotations.UntrustedWorkload annotation is exactly "true".
func untrustedWorkload(config *runtime.PodSandboxConfig) bool {
        podAnnotations := config.GetAnnotations()
        return podAnnotations[annotations.UntrustedWorkload] == "true"
}

// hostAccessingSandbox reports whether the sandbox configuration asks for any
// of the network, PID or IPC namespaces in NODE mode, i.e. requires additional
// host access for the sandbox.
func hostAccessingSandbox(config *runtime.PodSandboxConfig) bool {
        nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions()

        requested := []runtime.NamespaceMode{
                nsOpts.GetNetwork(),
                nsOpts.GetPid(),
                nsOpts.GetIpc(),
        }
        for _, mode := range requested {
                if mode == runtime.NamespaceMode_NODE {
                        return true
                }
        }
        return false
}

// GenerateRuntimeOptions generates runtime options from cri plugin config.
//
// The runtime's Options are encoded to TOML and decoded again into the options
// struct selected by getRuntimeOptionsType for r.Type. It returns (nil, nil)
// when r.Options is nil. Both the encode and the decode failure are wrapped
// with the runtime type so the caller can tell which runtime entry is broken.
func GenerateRuntimeOptions(r Runtime) (interface{}, error) {
        if r.Options == nil {
                return nil, nil
        }

        b, err := toml.Marshal(r.Options)
        if err != nil {
                return nil, fmt.Errorf("failed to marshal TOML blob for runtime %q: %w", r.Type, err)
        }

        options := getRuntimeOptionsType(r.Type)
        if err := toml.Unmarshal(b, options); err != nil {
                return nil, fmt.Errorf("failed to unmarshal TOML blob for runtime %q: %w", r.Type, err)
        }

        // For generic configuration, if no config path specified (preserving old behavior), pass
        // the whole TOML configuration section to the runtime.
        if runtimeOpts, ok := options.(*runtimeoptions.Options); ok && runtimeOpts.ConfigPath == "" {
                runtimeOpts.ConfigBody = b
        }

        return options, nil
}

// getRuntimeOptionsType returns a pointer to a freshly allocated, empty options
// struct for the given runtime type name: the runc options for
// plugins.RuntimeRuncV2, the runhcs options for plugins.RuntimeRunhcsV1, and
// the generic runtime options for every other type.
func getRuntimeOptionsType(t string) interface{} {
        if t == plugins.RuntimeRuncV2 {
                return &runcoptions.Options{}
        }
        if t == plugins.RuntimeRunhcsV1 {
                return &runhcsoptions.Options{}
        }
        return &runtimeoptions.Options{}
}

// DefaultServerConfig returns the default server configuration: the TCP
// service disabled, the stream server on 127.0.0.1 with port "0", the stream
// idle timeout taken from streaming.DefaultConfig, and TLS streaming disabled
// with no key pair files set.
func DefaultServerConfig() ServerConfig {
        var cfg ServerConfig

        cfg.DisableTCPService = true
        cfg.StreamServerAddress = "127.0.0.1"
        cfg.StreamServerPort = "0"
        cfg.StreamIdleTimeout = streaming.DefaultConfig.StreamIdleTimeout.String() // 4 hour
        cfg.EnableTLSStreaming = false
        cfg.X509KeyPairStreaming = X509KeyPairStreaming{
                TLSKeyFile:  "",
                TLSCertFile: "",
        }

        return cfg
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package config

import (
        "context"
        "errors"
        "fmt"

        kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
)

// kernelGreaterEqualThan holds kernel.GreaterEqualThan in a package-level
// variable; ValidateEnableUnprivileged performs its kernel version check
// through this indirection.
// NOTE(review): presumably a seam so tests can substitute the check — confirm.
var kernelGreaterEqualThan = kernel.GreaterEqualThan

// ValidateEnableUnprivileged verifies that the running kernel is at least 4.11
// when either EnableUnprivilegedICMP or EnableUnprivilegedPorts is set. It
// returns nil when neither option is enabled, and an error when the kernel
// version check itself fails or reports an older kernel.
func ValidateEnableUnprivileged(ctx context.Context, c *RuntimeConfig) error {
        if !c.EnableUnprivilegedICMP && !c.EnableUnprivilegedPorts {
                return nil
        }

        minKernel := kernel.KernelVersion{Kernel: 4, Major: 11}
        supported, checkErr := kernelGreaterEqualThan(minKernel)
        switch {
        case checkErr != nil:
                return fmt.Errorf("check current system kernel version error: %w", checkErr)
        case !supported:
                return errors.New("unprivileged_icmp and unprivileged_port require kernel version greater than or equal to 4.11")
        }
        return nil
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package config

import (
        "github.com/containerd/containerd/v2/defaults"
        "github.com/pelletier/go-toml/v2"
)

// DefaultImageConfig returns the default image configuration for non-Windows
// builds: the default snapshotter, snapshot annotations disabled, three
// concurrent downloads, the node key model for image decryption, the default
// sandbox image pinned under "sandbox", the default image pull progress
// timeout, sync-fs on pull disabled, and a stats collect period of 10.
func DefaultImageConfig() ImageConfig {
        pinned := map[string]string{
                "sandbox": DefaultSandboxImage,
        }
        decryption := ImageDecryption{
                KeyModel: KeyModelNode,
        }

        var cfg ImageConfig
        cfg.Snapshotter = defaults.DefaultSnapshotter
        cfg.DisableSnapshotAnnotations = true
        cfg.MaxConcurrentDownloads = 3
        cfg.ImageDecryption = decryption
        cfg.PinnedImages = pinned
        cfg.ImagePullProgressTimeout = defaultImagePullProgressTimeoutDuration.String()
        cfg.ImagePullWithSyncFs = false
        cfg.StatsCollectPeriod = 10
        return cfg
}

// DefaultRuntimeConfig returns default configurations of cri plugin.
//
// The default "runc" runtime entry carries an options table decoded from the
// embedded TOML document below, so that every runc v2 option appears with its
// default value and a describing comment source next to it.
func DefaultRuntimeConfig() RuntimeConfig {
        defaultRuncV2Opts := `
        # NoNewKeyring disables new keyring for the container.
        NoNewKeyring = false

        # ShimCgroup places the shim in a cgroup.
        ShimCgroup = ""

        # IoUid sets the I/O's pipes uid.
        IoUid = 0

        # IoGid sets the I/O's pipes gid.
        IoGid = 0

        # BinaryName is the binary name of the runc binary.
        BinaryName = ""

        # Root is the runc root directory.
        Root = ""

        # CriuImagePath is the criu image path
        CriuImagePath = ""

        # CriuWorkPath is the criu work path.
        CriuWorkPath = ""
`
        var m map[string]interface{}
        // The document above is a compile-time constant, so decoding it can only
        // fail if the literal itself is edited into invalid TOML. This function has
        // no error return, so the result is discarded explicitly; on failure m
        // stays nil and the runc runtime is returned without options.
        _ = toml.Unmarshal([]byte(defaultRuncV2Opts), &m)

        return RuntimeConfig{
                CniConfig: CniConfig{
                        NetworkPluginBinDir:        "/opt/cni/bin",
                        NetworkPluginConfDir:       "/etc/cni/net.d",
                        NetworkPluginMaxConfNum:    1, // only one CNI plugin config file will be loaded
                        NetworkPluginSetupSerially: false,
                        NetworkPluginConfTemplate:  "",
                },
                ContainerdConfig: ContainerdConfig{
                        DefaultRuntimeName: "runc",
                        Runtimes: map[string]Runtime{
                                "runc": {
                                        Type:      "io.containerd.runc.v2",
                                        Options:   m,
                                        Sandboxer: string(ModePodSandbox),
                                },
                        },
                },
                EnableSelinux:                    false,
                SelinuxCategoryRange:             1024,
                MaxContainerLogLineSize:          16 * 1024,
                DisableProcMount:                 false,
                TolerateMissingHugetlbController: true,
                DisableHugetlbController:         true,
                IgnoreImageDefinedVolumes:        false,
                EnableCDI:                        true,
                CDISpecDirs:                      []string{"/etc/cdi", "/var/run/cdi"},
                DrainExecSyncIOTimeout:           "0s",
                EnableUnprivilegedPorts:          true,
                EnableUnprivilegedICMP:           true,
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package config

import (
        "crypto/tls"
        "errors"
        "fmt"
        "net"
        "os"
        "time"

        k8snet "k8s.io/apimachinery/pkg/util/net"
        k8scert "k8s.io/client-go/util/cert"

        "k8s.io/kubelet/pkg/cri/streaming"
)

// streamListenerMode identifies how the stream server listener is secured.
// Values are produced by getStreamListenerMode and consumed by
// (*ServerConfig).StreamingConfig.
type streamListenerMode int

const (
        // x509KeyPairTLS selects TLS with the certificate and key files configured
        // in X509KeyPairStreaming.
        x509KeyPairTLS streamListenerMode = iota
        // selfSignTLS selects TLS with a certificate generated by newTLSCert.
        selfSignTLS
        // withoutTLS leaves the stream server without a TLS configuration.
        withoutTLS
)

// getStreamListenerMode derives the listener mode from the TLS related fields
// of the server configuration.
//
// With EnableTLSStreaming set: both cert and key files configured selects
// x509KeyPairTLS, neither configured selects selfSignTLS, and exactly one
// configured is an error. With EnableTLSStreaming unset: any configured cert
// or key file is an error, otherwise withoutTLS is selected. On error the
// returned mode is -1.
func getStreamListenerMode(config *ServerConfig) (streamListenerMode, error) {
        hasCert := config.X509KeyPairStreaming.TLSCertFile != ""
        hasKey := config.X509KeyPairStreaming.TLSKeyFile != ""

        if !config.EnableTLSStreaming {
                switch {
                case hasCert:
                        return -1, errors.New("X509KeyPairStreaming.TLSCertFile is set but EnableTLSStreaming is not set")
                case hasKey:
                        return -1, errors.New("X509KeyPairStreaming.TLSKeyFile is set but EnableTLSStreaming is not set")
                }
                return withoutTLS, nil
        }

        switch {
        case hasCert && hasKey:
                return x509KeyPairTLS, nil
        case hasCert:
                // Cert without key.
                return -1, errors.New("must set X509KeyPairStreaming.TLSKeyFile")
        case hasKey:
                // Key without cert.
                return -1, errors.New("must set X509KeyPairStreaming.TLSCertFile")
        default:
                return selfSignTLS, nil
        }
}

// StreamingConfig builds the streaming.Config for the stream server from the
// server configuration.
//
// It starts from streaming.DefaultConfig, overrides the idle timeout when
// StreamIdleTimeout is set, and sets the listen address from
// StreamServerAddress (resolved via k8snet.ResolveBindAddress when empty) and
// StreamServerPort. Depending on the mode reported by getStreamListenerMode,
// the TLS configuration is loaded from the configured key pair, generated by
// newTLSCert, or left unset. On any failure an empty streaming.Config is
// returned together with the error.
func (c *ServerConfig) StreamingConfig() (streaming.Config, error) {
        host := c.StreamServerAddress
        if host == "" {
                bindAddr, err := k8snet.ResolveBindAddress(nil)
                if err != nil {
                        return streaming.Config{}, fmt.Errorf("failed to get stream server address: %w", err)
                }
                host = bindAddr.String()
        }

        cfg := streaming.DefaultConfig
        if idle := c.StreamIdleTimeout; idle != "" {
                parsed, err := time.ParseDuration(idle)
                if err != nil {
                        return streaming.Config{}, fmt.Errorf("invalid stream idle timeout: %w", err)
                }
                cfg.StreamIdleTimeout = parsed
        }
        cfg.Addr = net.JoinHostPort(host, c.StreamServerPort)

        mode, err := getStreamListenerMode(c)
        if err != nil {
                return streaming.Config{}, fmt.Errorf("invalid stream server configuration: %w", err)
        }

        var cert tls.Certificate
        switch mode {
        case withoutTLS:
                // No TLS configuration to attach.
                return cfg, nil
        case x509KeyPairTLS:
                cert, err = tls.LoadX509KeyPair(c.X509KeyPairStreaming.TLSCertFile, c.X509KeyPairStreaming.TLSKeyFile)
                if err != nil {
                        return streaming.Config{}, fmt.Errorf("failed to load x509 key pair for stream server: %w", err)
                }
        case selfSignTLS:
                cert, err = newTLSCert()
                if err != nil {
                        return streaming.Config{}, fmt.Errorf("failed to generate tls certificate for stream server: %w", err)
                }
        default:
                return streaming.Config{}, errors.New("invalid configuration for the stream listener")
        }

        cfg.TLSConfig = &tls.Config{
                Certificates: []tls.Certificate{cert},
        }
        return cfg, nil
}

// newTLSCert returns a self CA signed tls.Certificate.
//
// The certificate is generated for the local hostname, with every IP address
// reported by net.InterfaceAddrs added both as an alternate IP and, in string
// form, as an alternate DNS name.
//
// TODO (mikebrow): replace / rewrite this function to support using CA
// signing of the certificate. Requires a security plan for kubernetes regarding
// CRI connections / streaming, etc. For example, kubernetes could configure or
// require a CA service and pass a configuration down through CRI.
func newTLSCert() (tls.Certificate, error) {
        hostName, err := os.Hostname()
        if err != nil {
                return tls.Certificate{}, fmt.Errorf("failed to get hostname: %w", err)
        }

        ifaceAddrs, err := net.InterfaceAddrs()
        if err != nil {
                return tls.Certificate{}, fmt.Errorf("failed to get host IP addresses: %w", err)
        }

        var (
                sanIPs   []net.IP
                sanNames []string
        )
        for _, ifaceAddr := range ifaceAddrs {
                var ip net.IP
                if ipNet, isNet := ifaceAddr.(*net.IPNet); isNet {
                        ip = ipNet.IP
                } else if ipAddr, isAddr := ifaceAddr.(*net.IPAddr); isAddr {
                        ip = ipAddr.IP
                } else {
                        // Address kinds that carry no IP are skipped.
                        continue
                }

                sanIPs = append(sanIPs, ip)
                sanNames = append(sanNames, ip.String())
        }

        // Generate a self signed certificate key (CA is self).
        certPEM, keyPEM, err := k8scert.GenerateSelfSignedCertKey(hostName, sanIPs, sanNames)
        if err != nil {
                return tls.Certificate{}, fmt.Errorf("certificate key could not be created: %w", err)
        }

        // Load the tls certificate from the generated PEM pair.
        loaded, err := tls.X509KeyPair(certPEM, keyPEM)
        if err != nil {
                return tls.Certificate{}, fmt.Errorf("certificate could not be loaded: %w", err)
        }
        return loaded, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package instrument

import (
        "context"

        "github.com/containerd/containerd/v2/pkg/tracing"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
)

const (
        // criSpanPrefix is a prefix for CRI server specific spans. It is combined
        // with the method name through tracing.Name when a span is started.
        criSpanPrefix = "pkg.cri.server"
)

// criService is a CRI server dependency to be wrapped with instrumentation.
// Besides the gRPC services it must report whether it has finished
// initializing, which checkInitialized consults before every request.
type criService interface {
        GRPCServices

        IsInitialized() bool
}

// GRPCServices are all the grpc services provided by cri containerd: the CRI
// runtime service and the CRI image service.
type GRPCServices interface {
        runtime.RuntimeServiceServer
        runtime.ImageServiceServer
}

// instrumentedService wraps service with containerd namespace and logs.
// Each method forwards to the wrapped criService after an initialization check.
type instrumentedService struct {
        // c is the wrapped CRI service that performs the actual work.
        c criService
}

// NewService wraps the given CRI service in an instrumentedService and returns
// it as GRPCServices.
func NewService(c criService) GRPCServices {
        wrapped := &instrumentedService{c: c}
        return wrapped
}

// checkInitialized returns an ErrUnavailable gRPC error while the wrapped
// service reports that it is not initialized, and nil once it is.
// GRPC service request handlers should return error before server is fully
// initialized.
// NOTE(random-liu): All following functions MUST check initialized at the beginning.
func (in *instrumentedService) checkInitialized() error {
        if !in.c.IsInitialized() {
                return errdefs.ToGRPCf(errdefs.ErrUnavailable, "server is not initialized yet")
        }
        return nil
}

// RunPodSandbox forwards the request to the wrapped service using the context
// from ctrdutil.WithNamespace. It fails fast with the checkInitialized error,
// logs the sandbox metadata on entry and the outcome on return at Info/Error
// level, and converts the resulting error with errdefs.ToGRPC.
func (in *instrumentedService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandboxRequest) (res *runtime.RunPodSandboxResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Infof("RunPodSandbox for %+v", r.GetConfig().GetMetadata())
        defer func() {
                if err == nil {
                        log.G(ctx).Infof("RunPodSandbox for %+v returns sandbox id %q", r.GetConfig().GetMetadata(), res.GetPodSandboxId())
                        return
                }
                log.G(ctx).WithError(err).Errorf("RunPodSandbox for %+v failed, error", r.GetConfig().GetMetadata())
        }()
        res, err = in.c.RunPodSandbox(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// ListPodSandbox forwards the request to the wrapped service using the context
// from ctrdutil.WithNamespace. It fails fast with the checkInitialized error,
// traces the filter on entry and the returned items on success, logs failures
// at Error level, and converts the resulting error with errdefs.ToGRPC.
func (in *instrumentedService) ListPodSandbox(ctx context.Context, r *runtime.ListPodSandboxRequest) (res *runtime.ListPodSandboxResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Tracef("ListPodSandbox with filter %+v", r.GetFilter())
        defer func() {
                if err == nil {
                        log.G(ctx).Tracef("ListPodSandbox returns pod sandboxes %+v", res.GetItems())
                        return
                }
                log.G(ctx).WithError(err).Error("ListPodSandbox failed")
        }()
        res, err = in.c.ListPodSandbox(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// PodSandboxStatus forwards the request to the wrapped service using the
// context from ctrdutil.WithNamespace. It fails fast with the checkInitialized
// error, traces the sandbox id on entry and the status on success, logs
// failures at Error level, and converts the resulting error with
// errdefs.ToGRPC.
func (in *instrumentedService) PodSandboxStatus(ctx context.Context, r *runtime.PodSandboxStatusRequest) (res *runtime.PodSandboxStatusResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Tracef("PodSandboxStatus for %q", r.GetPodSandboxId())
        defer func() {
                if err == nil {
                        log.G(ctx).Tracef("PodSandboxStatus for %q returns status %+v", r.GetPodSandboxId(), res.GetStatus())
                        return
                }
                log.G(ctx).WithError(err).Errorf("PodSandboxStatus for %q failed", r.GetPodSandboxId())
        }()
        res, err = in.c.PodSandboxStatus(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// StopPodSandbox forwards the request to the wrapped service using the context
// from ctrdutil.WithNamespace. It fails fast with the checkInitialized error,
// logs the sandbox id on entry and the outcome on return at Info/Error level,
// and converts the resulting error with errdefs.ToGRPC.
func (in *instrumentedService) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandboxRequest) (_ *runtime.StopPodSandboxResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Infof("StopPodSandbox for %q", r.GetPodSandboxId())
        defer func() {
                if err == nil {
                        log.G(ctx).Infof("StopPodSandbox for %q returns successfully", r.GetPodSandboxId())
                        return
                }
                log.G(ctx).WithError(err).Errorf("StopPodSandbox for %q failed", r.GetPodSandboxId())
        }()
        // err here is the named result, so the deferred logger observes it.
        resp, err := in.c.StopPodSandbox(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return resp, err
}

// RemovePodSandbox forwards the request to the wrapped service using the
// context from ctrdutil.WithNamespace. It fails fast with the checkInitialized
// error, logs the sandbox id on entry and the outcome on return at Info/Error
// level, and converts the resulting error with errdefs.ToGRPC.
func (in *instrumentedService) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodSandboxRequest) (_ *runtime.RemovePodSandboxResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Infof("RemovePodSandbox for %q", r.GetPodSandboxId())
        defer func() {
                if err == nil {
                        log.G(ctx).Infof("RemovePodSandbox %q returns successfully", r.GetPodSandboxId())
                        return
                }
                log.G(ctx).WithError(err).Errorf("RemovePodSandbox for %q failed", r.GetPodSandboxId())
        }()
        // err here is the named result, so the deferred logger observes it.
        resp, err := in.c.RemovePodSandbox(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return resp, err
}

// PortForward forwards the request to the wrapped service using the context
// from ctrdutil.WithNamespace. It fails fast with the checkInitialized error,
// logs the sandbox id and ports on entry and the returned URL on success at
// Info level, logs failures at Error level, and converts the resulting error
// with errdefs.ToGRPC.
func (in *instrumentedService) PortForward(ctx context.Context, r *runtime.PortForwardRequest) (res *runtime.PortForwardResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Infof("Portforward for %q port %v", r.GetPodSandboxId(), r.GetPort())
        defer func() {
                if err == nil {
                        log.G(ctx).Infof("Portforward for %q returns URL %q", r.GetPodSandboxId(), res.GetUrl())
                        return
                }
                log.G(ctx).WithError(err).Errorf("Portforward for %q failed", r.GetPodSandboxId())
        }()
        res, err = in.c.PortForward(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// CreateContainer forwards the request to the wrapped service using the
// context from ctrdutil.WithNamespace. It fails fast with the checkInitialized
// error, logs the sandbox id and container metadata on entry and the new
// container id on success at Info level, logs failures at Error level, and
// converts the resulting error with errdefs.ToGRPC.
func (in *instrumentedService) CreateContainer(ctx context.Context, r *runtime.CreateContainerRequest) (res *runtime.CreateContainerResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Infof("CreateContainer within sandbox %q for container %+v",
                r.GetPodSandboxId(), r.GetConfig().GetMetadata())
        defer func() {
                if err == nil {
                        log.G(ctx).Infof("CreateContainer within sandbox %q for %+v returns container id %q",
                                r.GetPodSandboxId(), r.GetConfig().GetMetadata(), res.GetContainerId())
                        return
                }
                log.G(ctx).WithError(err).Errorf("CreateContainer within sandbox %q for %+v failed",
                        r.GetPodSandboxId(), r.GetConfig().GetMetadata())
        }()
        res, err = in.c.CreateContainer(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// StartContainer forwards the request to the wrapped service using the context
// from ctrdutil.WithNamespace. It fails fast with the checkInitialized error,
// logs the container id on entry and the outcome on return at Info/Error
// level, and converts the resulting error with errdefs.ToGRPC.
func (in *instrumentedService) StartContainer(ctx context.Context, r *runtime.StartContainerRequest) (_ *runtime.StartContainerResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Infof("StartContainer for %q", r.GetContainerId())
        defer func() {
                if err == nil {
                        log.G(ctx).Infof("StartContainer for %q returns successfully", r.GetContainerId())
                        return
                }
                log.G(ctx).WithError(err).Errorf("StartContainer for %q failed", r.GetContainerId())
        }()
        // err here is the named result, so the deferred logger observes it.
        resp, err := in.c.StartContainer(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return resp, err
}

// ListContainers forwards the request to the wrapped service using the context
// from ctrdutil.WithNamespace. It fails fast with the checkInitialized error,
// traces the filter on entry and the returned containers on success, logs
// failures at Error level, and converts the resulting error with
// errdefs.ToGRPC.
func (in *instrumentedService) ListContainers(ctx context.Context, r *runtime.ListContainersRequest) (res *runtime.ListContainersResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Tracef("ListContainers with filter %+v", r.GetFilter())
        defer func() {
                if err == nil {
                        log.G(ctx).Tracef("ListContainers with filter %+v returns containers %+v",
                                r.GetFilter(), res.GetContainers())
                        return
                }
                log.G(ctx).WithError(err).Errorf("ListContainers with filter %+v failed", r.GetFilter())
        }()
        res, err = in.c.ListContainers(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// ContainerStatus forwards the request to the wrapped service using the
// context from ctrdutil.WithNamespace. It fails fast with the checkInitialized
// error, traces the container id on entry and the status on success, logs
// failures at Error level, and converts the resulting error with
// errdefs.ToGRPC.
func (in *instrumentedService) ContainerStatus(ctx context.Context, r *runtime.ContainerStatusRequest) (res *runtime.ContainerStatusResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Tracef("ContainerStatus for %q", r.GetContainerId())
        defer func() {
                if err == nil {
                        log.G(ctx).Tracef("ContainerStatus for %q returns status %+v", r.GetContainerId(), res.GetStatus())
                        return
                }
                log.G(ctx).WithError(err).Errorf("ContainerStatus for %q failed", r.GetContainerId())
        }()
        res, err = in.c.ContainerStatus(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// StopContainer forwards the request to the wrapped service using the context
// from ctrdutil.WithNamespace. It fails fast with the checkInitialized error,
// logs the container id and timeout on entry and the outcome on return at
// Info/Error level, and converts the resulting error with errdefs.ToGRPC.
func (in *instrumentedService) StopContainer(ctx context.Context, r *runtime.StopContainerRequest) (res *runtime.StopContainerResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Infof("StopContainer for %q with timeout %d (s)", r.GetContainerId(), r.GetTimeout())
        defer func() {
                if err == nil {
                        log.G(ctx).Infof("StopContainer for %q returns successfully", r.GetContainerId())
                        return
                }
                log.G(ctx).WithError(err).Errorf("StopContainer for %q failed", r.GetContainerId())
        }()
        res, err = in.c.StopContainer(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// RemoveContainer forwards the request to the wrapped service using the
// context from ctrdutil.WithNamespace. It fails fast with the checkInitialized
// error, logs the container id on entry and the outcome on return at
// Info/Error level, and converts the resulting error with errdefs.ToGRPC.
func (in *instrumentedService) RemoveContainer(ctx context.Context, r *runtime.RemoveContainerRequest) (res *runtime.RemoveContainerResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Infof("RemoveContainer for %q", r.GetContainerId())
        defer func() {
                if err == nil {
                        log.G(ctx).Infof("RemoveContainer for %q returns successfully", r.GetContainerId())
                        return
                }
                log.G(ctx).WithError(err).Errorf("RemoveContainer for %q failed", r.GetContainerId())
        }()
        res, err = in.c.RemoveContainer(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// ExecSync forwards the request to the wrapped service using the context from
// ctrdutil.WithNamespace. It fails fast with the checkInitialized error, logs
// the container id, command and timeout on entry at Debug level, traces the
// exit code on success, logs failures at Error level, and converts the
// resulting error with errdefs.ToGRPC.
func (in *instrumentedService) ExecSync(ctx context.Context, r *runtime.ExecSyncRequest) (res *runtime.ExecSyncResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Debugf("ExecSync for %q with command %+v and timeout %d (s)", r.GetContainerId(), r.GetCmd(), r.GetTimeout())
        defer func() {
                if err == nil {
                        log.G(ctx).Tracef("ExecSync for %q returns with exit code %d", r.GetContainerId(), res.GetExitCode())
                        return
                }
                log.G(ctx).WithError(err).Errorf("ExecSync for %q failed", r.GetContainerId())
        }()
        res, err = in.c.ExecSync(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// Exec forwards the request to the wrapped service using the context from
// ctrdutil.WithNamespace. It fails fast with the checkInitialized error, logs
// the container id, command, tty and stdin flags on entry and the returned URL
// on success at Debug level, logs failures at Error level, and converts the
// resulting error with errdefs.ToGRPC.
func (in *instrumentedService) Exec(ctx context.Context, r *runtime.ExecRequest) (res *runtime.ExecResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Debugf("Exec for %q with command %+v, tty %v and stdin %v",
                r.GetContainerId(), r.GetCmd(), r.GetTty(), r.GetStdin())
        defer func() {
                if err == nil {
                        log.G(ctx).Debugf("Exec for %q returns URL %q", r.GetContainerId(), res.GetUrl())
                        return
                }
                log.G(ctx).WithError(err).Errorf("Exec for %q failed", r.GetContainerId())
        }()
        res, err = in.c.Exec(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// Attach forwards the request to the wrapped service using the context from
// ctrdutil.WithNamespace. It fails fast with the checkInitialized error, logs
// the container id, tty and stdin flags on entry and the returned URL on
// success at Debug level, logs failures at Error level, and converts the
// resulting error with errdefs.ToGRPC.
func (in *instrumentedService) Attach(ctx context.Context, r *runtime.AttachRequest) (res *runtime.AttachResponse, err error) {
        if err := in.checkInitialized(); err != nil {
                return nil, err
        }
        log.G(ctx).Debugf("Attach for %q with tty %v and stdin %v", r.GetContainerId(), r.GetTty(), r.GetStdin())
        defer func() {
                if err != nil {
                        log.G(ctx).WithError(err).Errorf("Attach for %q failed", r.GetContainerId())
                } else {
                        // Use the getter rather than the field: it tolerates a nil
                        // response, so logging can never panic here, and it matches
                        // the Exec and PortForward wrappers.
                        log.G(ctx).Debugf("Attach for %q returns URL %q", r.GetContainerId(), res.GetUrl())
                }
        }()
        res, err = in.c.Attach(ctrdutil.WithNamespace(ctx), r)
        return res, errdefs.ToGRPC(err)
}

// UpdateContainerResources forwards the request to the wrapped service using
// the context from ctrdutil.WithNamespace. It fails fast with the
// checkInitialized error, logs the container id with the Linux and Windows
// resource sections on entry and the outcome on return at Info/Error level,
// and converts the resulting error with errdefs.ToGRPC.
func (in *instrumentedService) UpdateContainerResources(ctx context.Context, r *runtime.UpdateContainerResourcesRequest) (res *runtime.UpdateContainerResourcesResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Infof("UpdateContainerResources for %q with Linux: %+v / Windows: %+v", r.GetContainerId(), r.GetLinux(), r.GetWindows())
        defer func() {
                if err == nil {
                        log.G(ctx).Infof("UpdateContainerResources for %q returns successfully", r.GetContainerId())
                        return
                }
                log.G(ctx).WithError(err).Errorf("UpdateContainerResources for %q failed", r.GetContainerId())
        }()
        res, err = in.c.UpdateContainerResources(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// PullImage forwards the request to the wrapped service using the context from
// ctrdutil.WithNamespace, inside a tracing span named from criSpanPrefix and
// "PullImage". It fails fast with the checkInitialized error (before any span
// is started), logs the image on entry and the image reference on success at
// Info level, logs failures at Error level, records the final error on the
// span, and converts the resulting error with errdefs.ToGRPC.
func (in *instrumentedService) PullImage(ctx context.Context, r *runtime.PullImageRequest) (res *runtime.PullImageResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        ctx, span := tracing.StartSpan(ctx, tracing.Name(criSpanPrefix, "PullImage"))
        defer span.End()
        log.G(ctx).Infof("PullImage %q", r.GetImage().GetImage())
        defer func() {
                if err == nil {
                        log.G(ctx).Infof("PullImage %q returns image reference %q",
                                r.GetImage().GetImage(), res.GetImageRef())
                } else {
                        log.G(ctx).WithError(err).Errorf("PullImage %q failed", r.GetImage().GetImage())
                }
                // Runs before the deferred span.End above.
                span.SetStatus(err)
        }()
        res, err = in.c.PullImage(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// ListImages forwards the request to the wrapped service inside a tracing
// span. The filter and the returned image list are logged at trace level;
// failures are logged at error level.
func (in *instrumentedService) ListImages(ctx context.Context, r *runtime.ListImagesRequest) (res *runtime.ListImagesResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        ctx, span := tracing.StartSpan(ctx, tracing.Name(criSpanPrefix, "ListImages"))
        defer span.End()
        log.G(ctx).Tracef("ListImages with filter %+v", r.GetFilter())
        defer func() {
                // Runs before span.End (defers are LIFO), so the status lands on an open span.
                if err == nil {
                        log.G(ctx).Tracef("ListImages with filter %+v returns image list %+v",
                                r.GetFilter(), res.GetImages())
                } else {
                        log.G(ctx).WithError(err).Errorf("ListImages with filter %+v failed", r.GetFilter())
                }
                span.SetStatus(err)
        }()
        res, err = in.c.ListImages(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// ImageStatus forwards the request to the wrapped service inside a tracing
// span. The queried image and the returned status are logged at trace level;
// failures are logged at error level.
func (in *instrumentedService) ImageStatus(ctx context.Context, r *runtime.ImageStatusRequest) (res *runtime.ImageStatusResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        ctx, span := tracing.StartSpan(ctx, tracing.Name(criSpanPrefix, "ImageStatus"))
        defer span.End()
        log.G(ctx).Tracef("ImageStatus for %q", r.GetImage().GetImage())
        defer func() {
                // Runs before span.End (defers are LIFO), so the status lands on an open span.
                if err == nil {
                        log.G(ctx).Tracef("ImageStatus for %q returns image status %+v",
                                r.GetImage().GetImage(), res.GetImage())
                } else {
                        log.G(ctx).WithError(err).Errorf("ImageStatus for %q failed", r.GetImage().GetImage())
                }
                span.SetStatus(err)
        }()
        res, err = in.c.ImageStatus(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// RemoveImage forwards the request to the wrapped service inside a tracing
// span, logging the image being removed and whether the removal succeeded.
func (in *instrumentedService) RemoveImage(ctx context.Context, r *runtime.RemoveImageRequest) (_ *runtime.RemoveImageResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        ctx, span := tracing.StartSpan(ctx, tracing.Name(criSpanPrefix, "RemoveImage"))
        defer span.End()
        log.G(ctx).Infof("RemoveImage %q", r.GetImage().GetImage())
        defer func() {
                // Runs before span.End (defers are LIFO), so the status lands on an open span.
                if err == nil {
                        log.G(ctx).Infof("RemoveImage %q returns successfully", r.GetImage().GetImage())
                } else {
                        log.G(ctx).WithError(err).Errorf("RemoveImage %q failed", r.GetImage().GetImage())
                }
                span.SetStatus(err)
        }()
        // The response result is unnamed, so hold it in a local; the explicit
        // return below still assigns the named err that the deferred logger reads.
        resp, callErr := in.c.RemoveImage(ctrdutil.WithNamespace(ctx), r)
        return resp, errdefs.ToGRPC(callErr)
}

// ImageFsInfo forwards the request to the wrapped service inside a tracing
// span. The returned image filesystem info is logged at trace level; failures
// are logged at error level. If checkInitialized reports an error it is
// returned before any span is started.
func (in *instrumentedService) ImageFsInfo(ctx context.Context, r *runtime.ImageFsInfoRequest) (res *runtime.ImageFsInfoResponse, err error) {
        if err := in.checkInitialized(); err != nil {
                return nil, err
        }
        ctx, span := tracing.StartSpan(ctx, tracing.Name(criSpanPrefix, "ImageFsInfo"))
        defer span.End()
        log.G(ctx).Tracef("ImageFsInfo")
        defer func() {
                if err != nil {
                        log.G(ctx).WithError(err).Error("ImageFsInfo failed")
                } else {
                        // Use the generated getter rather than the field: the argument is
                        // evaluated even when trace logging is off, and the getter does not
                        // panic if the wrapped service returned a nil response with a nil error.
                        log.G(ctx).Tracef("ImageFsInfo returns filesystem info %+v", res.GetImageFilesystems())
                }
                span.SetStatus(err)
        }()
        res, err = in.c.ImageFsInfo(ctrdutil.WithNamespace(ctx), r)
        return res, errdefs.ToGRPC(err)
}

// PodSandboxStats forwards the request to the wrapped service. The sandbox ID
// and the returned stats are logged at trace level; failures are logged at
// error level.
func (in *instrumentedService) PodSandboxStats(ctx context.Context, r *runtime.PodSandboxStatsRequest) (res *runtime.PodSandboxStatsResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Tracef("PodSandboxStats for %q", r.GetPodSandboxId())
        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Tracef("PodSandboxStats for %q returns stats %+v", r.GetPodSandboxId(), res.GetStats())
                        return
                }
                log.G(ctx).WithError(err).Errorf("PodSandboxStats for %q failed", r.GetPodSandboxId())
        }()
        res, err = in.c.PodSandboxStats(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// ContainerStats forwards the request to the wrapped service. The container
// ID and the returned stats are logged at trace level; failures are logged at
// error level.
func (in *instrumentedService) ContainerStats(ctx context.Context, r *runtime.ContainerStatsRequest) (res *runtime.ContainerStatsResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Tracef("ContainerStats for %q", r.GetContainerId())
        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Tracef("ContainerStats for %q returns stats %+v", r.GetContainerId(), res.GetStats())
                        return
                }
                log.G(ctx).WithError(err).Errorf("ContainerStats for %q failed", r.GetContainerId())
        }()
        res, err = in.c.ContainerStats(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// ListPodSandboxStats forwards the request to the wrapped service. The filter
// and the returned stats are logged at trace level; failures are logged at
// error level.
func (in *instrumentedService) ListPodSandboxStats(ctx context.Context, r *runtime.ListPodSandboxStatsRequest) (res *runtime.ListPodSandboxStatsResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Tracef("ListPodSandboxStats with filter %+v", r.GetFilter())
        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Tracef("ListPodSandboxStats returns stats %+v", res.GetStats())
                        return
                }
                log.G(ctx).WithError(err).Error("ListPodSandboxStats failed")
        }()
        res, err = in.c.ListPodSandboxStats(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// ListContainerStats forwards the request to the wrapped service. The filter
// and the returned stats are logged at trace level; failures are logged at
// error level.
func (in *instrumentedService) ListContainerStats(ctx context.Context, r *runtime.ListContainerStatsRequest) (res *runtime.ListContainerStatsResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Tracef("ListContainerStats with filter %+v", r.GetFilter())
        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Tracef("ListContainerStats returns stats %+v", res.GetStats())
                        return
                }
                log.G(ctx).WithError(err).Error("ListContainerStats failed")
        }()
        res, err = in.c.ListContainerStats(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// Status forwards the request to the wrapped service. The call and the
// returned status are logged at trace level; failures are logged at error
// level.
func (in *instrumentedService) Status(ctx context.Context, r *runtime.StatusRequest) (res *runtime.StatusResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Tracef("Status")
        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Tracef("Status returns status %+v", res.GetStatus())
                        return
                }
                log.G(ctx).WithError(err).Error("Status failed")
        }()
        res, err = in.c.Status(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// Version forwards the request to the wrapped service. The client-side
// version and the full response are logged at trace level; failures are
// logged at error level.
func (in *instrumentedService) Version(ctx context.Context, r *runtime.VersionRequest) (res *runtime.VersionResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Tracef("Version with client side version %q", r.GetVersion())
        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Tracef("Version returns %+v", res)
                        return
                }
                log.G(ctx).WithError(err).Error("Version failed")
        }()
        res, err = in.c.Version(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// UpdateRuntimeConfig forwards the request to the wrapped service. The
// supplied runtime config and a success message are logged at debug level;
// failures are logged at error level.
func (in *instrumentedService) UpdateRuntimeConfig(ctx context.Context, r *runtime.UpdateRuntimeConfigRequest) (res *runtime.UpdateRuntimeConfigResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Debugf("UpdateRuntimeConfig with config %+v", r.GetRuntimeConfig())
        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Debug("UpdateRuntimeConfig returns successfully")
                        return
                }
                log.G(ctx).WithError(err).Error("UpdateRuntimeConfig failed")
        }()
        res, err = in.c.UpdateRuntimeConfig(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// ReopenContainerLog forwards the request to the wrapped service. The
// container ID and a success message are logged at debug level; failures are
// logged at error level.
func (in *instrumentedService) ReopenContainerLog(ctx context.Context, r *runtime.ReopenContainerLogRequest) (res *runtime.ReopenContainerLogResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Debugf("ReopenContainerLog for %q", r.GetContainerId())
        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Debugf("ReopenContainerLog for %q returns successfully", r.GetContainerId())
                        return
                }
                log.G(ctx).WithError(err).Errorf("ReopenContainerLog for %q failed", r.GetContainerId())
        }()
        res, err = in.c.ReopenContainerLog(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// CheckpointContainer forwards the request to the wrapped service and logs
// only the outcome: a debug message on success, an error message on failure.
func (in *instrumentedService) CheckpointContainer(ctx context.Context, r *runtime.CheckpointContainerRequest) (res *runtime.CheckpointContainerResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }

        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Debug("CheckpointContainer returns successfully")
                        return
                }
                // The message has no format verbs, so Error emits the same text Errorf would.
                log.G(ctx).WithError(err).Error("CheckpointContainer failed, error")
        }()

        res, err = in.c.CheckpointContainer(ctrdutil.WithNamespace(ctx), r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// GetContainerEvents forwards the streaming request to the wrapped service and
// logs only the outcome, using the stream's context for the logger. The
// request and stream are passed through unchanged.
func (in *instrumentedService) GetContainerEvents(r *runtime.GetEventsRequest, s runtime.RuntimeService_GetContainerEventsServer) (err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return initErr
        }

        ctx := s.Context()
        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Debug("GetContainerEvents returns successfully")
                        return
                }
                // The message has no format verbs, so Error emits the same text Errorf would.
                log.G(ctx).WithError(err).Error("GetContainerEvents failed, error")
        }()

        err = errdefs.ToGRPC(in.c.GetContainerEvents(r, s))
        return err
}

// ListMetricDescriptors forwards the request to the wrapped service and logs
// only the outcome: a trace message on success, an error message on failure.
// Unlike most sibling wrappers, the incoming ctx is passed through without
// ctrdutil.WithNamespace.
func (in *instrumentedService) ListMetricDescriptors(ctx context.Context, r *runtime.ListMetricDescriptorsRequest) (res *runtime.ListMetricDescriptorsResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }

        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Trace("ListMetricDescriptors returns successfully")
                        return
                }
                // The message has no format verbs, so Error emits the same text Errorf would.
                log.G(ctx).WithError(err).Error("ListMetricDescriptors failed, error")
        }()

        res, err = in.c.ListMetricDescriptors(ctx, r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// ListPodSandboxMetrics forwards the request to the wrapped service and logs
// only the outcome: a trace message on success, an error message on failure.
// Unlike most sibling wrappers, the incoming ctx is passed through without
// ctrdutil.WithNamespace.
func (in *instrumentedService) ListPodSandboxMetrics(ctx context.Context, r *runtime.ListPodSandboxMetricsRequest) (res *runtime.ListPodSandboxMetricsResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }

        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Trace("ListPodSandboxMetrics returns successfully")
                        return
                }
                // The message has no format verbs, so Error emits the same text Errorf would.
                log.G(ctx).WithError(err).Error("ListPodSandboxMetrics failed, error")
        }()

        res, err = in.c.ListPodSandboxMetrics(ctx, r)
        err = errdefs.ToGRPC(err)
        return res, err
}

// RuntimeConfig forwards the request to the wrapped service. The call and the
// full response are logged at trace level; failures are logged at error
// level. The incoming ctx is passed through without ctrdutil.WithNamespace.
func (in *instrumentedService) RuntimeConfig(ctx context.Context, r *runtime.RuntimeConfigRequest) (res *runtime.RuntimeConfigResponse, err error) {
        if initErr := in.checkInitialized(); initErr != nil {
                return nil, initErr
        }
        log.G(ctx).Tracef("RuntimeConfig")
        defer func() {
                // err is the named result, so it already holds the value being returned.
                if err == nil {
                        log.G(ctx).Tracef("RuntimeConfig returns config %+v", res)
                        return
                }
                log.G(ctx).WithError(err).Error("RuntimeConfig failed")
        }()
        res, err = in.c.RuntimeConfig(ctx, r)
        err = errdefs.ToGRPC(err)
        return res, err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package io

import (
        "errors"
        "fmt"
        "io"
        "strings"
        "sync"

        "github.com/containerd/log"

        "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/containerd/v2/pkg/cio"
        cioutil "github.com/containerd/containerd/v2/pkg/ioutil"
)

// streamKey builds the key for a stream by joining the container id, the
// stream name and the stream type with "-".
func streamKey(id, name string, stream StreamType) string {
        parts := [...]string{id, name, string(stream)}
        return strings.Join(parts[:], "-")
}

// ContainerIO holds the container io.
type ContainerIO struct {
        // id is the container ID; it is used to build stream keys and in log messages.
        id string

        // fifos describes the stdio endpoints; it is set through a ContainerIOOpts.
        fifos *cio.FIFOSet
        // stdioStream embeds the opened stdin/stdout/stderr streams.
        *stdioStream

        // stdoutGroup and stderrGroup are the writer groups that Pipe copies the
        // container's stdout and stderr into.
        stdoutGroup *cioutil.WriterGroup
        stderrGroup *cioutil.WriterGroup

        // closer is the wgCloser returned by newStdioStream for the opened streams.
        closer *wgCloser
}

// Compile-time assertion that *ContainerIO satisfies cio.IO.
var _ cio.IO = &ContainerIO{}

// ContainerIOOpts sets specific information to newly created ContainerIO.
// NewContainerIO applies each option in order and aborts on the first error.
type ContainerIOOpts func(*ContainerIO) error

// WithFIFOs returns an option that makes the container io use the given,
// already existing fifo set. The option itself never fails.
func WithFIFOs(fifos *cio.FIFOSet) ContainerIOOpts {
        assign := func(target *ContainerIO) error {
                target.fifos = fifos
                return nil
        }
        return assign
}

// WithNewFIFOs returns an option that creates a fresh fifo set for the
// container (via newFifos, keyed by the container id) and installs it on the
// container io. Any error from newFifos is returned by the option.
func WithNewFIFOs(root string, tty, stdin bool) ContainerIOOpts {
        return func(c *ContainerIO) error {
                created, err := newFifos(root, c.id, tty, stdin)
                if err == nil {
                        return WithFIFOs(created)(c)
                }
                return err
        }
}

// WithStreams returns an option that sets up stream endpoints for the
// container io instead of fifos.
// Each endpoint is built by newStreams as `<address>?streaming_id=<id>`, where
// the streaming IDs are ContainerID-stdin, ContainerID-stdout and
// ContainerID-stderr. For example, with address `ttrpc+unix:///run/demo.sock`,
// container ID `app`, stdin enabled and TTY disabled, the endpoints are:
//
//   - Stdin: ttrpc+unix:///run/demo.sock?streaming_id=app-stdin
//   - Stdout: ttrpc+unix:///run/demo.sock?streaming_id=app-stdout
//   - Stderr: ttrpc+unix:///run/demo.sock?streaming_id=app-stderr
//
// The streaming IDs will be used as unique key to establish stream tunnel.
// And it should support reconnection with the same streaming ID if containerd restarts.
//
// The option fails when address is empty.
func WithStreams(address string, tty, stdin bool) ContainerIOOpts {
        return func(c *ContainerIO) error {
                if len(address) == 0 {
                        return fmt.Errorf("address can not be empty for io stream")
                }
                streams, err := newStreams(address, c.id, tty, stdin)
                if err == nil {
                        return WithFIFOs(streams)(c)
                }
                return err
        }
}

// NewContainerIO creates the io for container id. Options are applied in
// order; one of them must set the fifo set, otherwise an error is returned.
// The stdio streams are then opened from that fifo set with newStdioStream.
func NewContainerIO(id string, opts ...ContainerIOOpts) (_ *ContainerIO, err error) {
        cntrIO := &ContainerIO{
                id:          id,
                stdoutGroup: cioutil.NewWriterGroup(),
                stderrGroup: cioutil.NewWriterGroup(),
        }
        for i := range opts {
                if optErr := opts[i](cntrIO); optErr != nil {
                        return nil, optErr
                }
        }
        if cntrIO.fifos == nil {
                return nil, errors.New("fifos are not set")
        }
        // Open the actual fifos/streams described by the fifo set.
        streams, closer, err := newStdioStream(cntrIO.fifos)
        if err != nil {
                return nil, err
        }
        cntrIO.closer = closer
        cntrIO.stdioStream = streams
        return cntrIO, nil
}

// Config returns the cio.Config carried by the container's fifo set.
func (c *ContainerIO) Config() cio.Config {
        set := c.fifos
        return set.Config
}

// Pipe starts background goroutines that copy the container's stdout and
// stderr into the corresponding writer groups. Each goroutine is registered
// on the closer's wait group, and closes both its source stream and its
// writer group once the copy ends. Stderr is only piped when the fifo set is
// not a terminal.
func (c *ContainerIO) Pipe() {
        group := c.closer.wg

        pumpStdout := func() {
                _, copyErr := io.Copy(c.stdoutGroup, c.stdout)
                if copyErr != nil {
                        log.L.WithError(copyErr).Errorf("Failed to pipe stdout of container %q", c.id)
                }
                c.stdout.Close()
                c.stdoutGroup.Close()
                group.Done()
                log.L.Debugf("Finish piping stdout of container %q", c.id)
        }

        pumpStderr := func() {
                _, copyErr := io.Copy(c.stderrGroup, c.stderr)
                if copyErr != nil {
                        log.L.WithError(copyErr).Errorf("Failed to pipe stderr of container %q", c.id)
                }
                c.stderr.Close()
                c.stderrGroup.Close()
                group.Done()
                log.L.Debugf("Finish piping stderr of container %q", c.id)
        }

        if c.stdout != nil {
                group.Add(1)
                go pumpStdout()
        }

        if !c.fifos.Terminal && c.stderr != nil {
                group.Add(1)
                go pumpStderr()
        }
}

// Attach attaches container stdio.
//
// It wires opts.Stdin into the container's stdin (when both exist) and adds
// opts.Stdout / opts.Stderr to the stdout / stderr writer groups under keys
// unique to this attach session. Stderr is not attached when opts.Tty is set.
// The call blocks until every goroutine it started has finished.
// TODO(random-liu): Use pools.Copy in docker to reduce memory usage?
func (c *ContainerIO) Attach(opts AttachOptions) {
        var wg sync.WaitGroup
        // A fresh ID per attach keeps the keys of concurrent sessions distinct.
        key := util.GenerateID()
        stdinKey := streamKey(c.id, "attach-"+key, Stdin)
        stdoutKey := streamKey(c.id, "attach-"+key, Stdout)
        stderrKey := streamKey(c.id, "attach-"+key, Stderr)

        var stdinStreamRC io.ReadCloser
        if c.stdin != nil && opts.Stdin != nil {
                // Create a wrapper of stdin which could be closed. Note that the
                // wrapper doesn't close the actual stdin, it only stops io.Copy.
                // The actual stdin will be closed by stream server.
                stdinStreamRC = cioutil.NewWrapReadCloser(opts.Stdin)
                wg.Add(1)
                go func() {
                        if _, err := io.Copy(c.stdin, stdinStreamRC); err != nil {
                                log.L.WithError(err).Errorf("Failed to pipe stdin for container attach %q", c.id)
                        }
                        log.L.Infof("Attach stream %q closed", stdinKey)
                        if opts.StdinOnce && !opts.Tty {
                                // Due to kubectl requirements and current docker behavior, when
                                // (opts.StdinOnce && !opts.Tty) we have to close container stdin
                                // and keep stdout and stderr open until container stops.
                                c.stdin.Close()
                                // Also closes the containerd side.
                                // NOTE(review): opts.CloseStdin is called without a nil check.
                                if err := opts.CloseStdin(); err != nil {
                                        log.L.WithError(err).Errorf("Failed to close stdin for container %q", c.id)
                                }
                        } else {
                                // Otherwise the end of stdin ends the session: drop this
                                // attach's output writers from the writer groups.
                                if opts.Stdout != nil {
                                        c.stdoutGroup.Remove(stdoutKey)
                                }
                                if opts.Stderr != nil {
                                        c.stderrGroup.Remove(stderrKey)
                                }
                        }
                        wg.Done()
                }()
        }

        // attachStream blocks until the given channel yields, then stops the
        // stdin copy (if any) and marks this output stream as finished.
        // NOTE(review): the channel presumably fires when the informer-wrapped
        // writer is closed; confirm against cioutil.NewWriteCloseInformer.
        attachStream := func(key string, close <-chan struct{}) {
                <-close
                log.L.Infof("Attach stream %q closed", key)
                // Make sure stdin gets closed.
                if stdinStreamRC != nil {
                        stdinStreamRC.Close()
                }
                wg.Done()
        }

        if opts.Stdout != nil {
                wg.Add(1)
                wc, close := cioutil.NewWriteCloseInformer(opts.Stdout)
                c.stdoutGroup.Add(stdoutKey, wc)
                go attachStream(stdoutKey, close)
        }
        if !opts.Tty && opts.Stderr != nil {
                wg.Add(1)
                wc, close := cioutil.NewWriteCloseInformer(opts.Stderr)
                c.stderrGroup.Add(stderrKey, wc)
                go attachStream(stderrKey, close)
        }
        // Block until the stdin copier and all output watchers are done.
        wg.Wait()
}

// AddOutput registers the given write closers on the container's stdout and
// stderr writer groups under a key derived from name. For each non-nil writer
// it returns whatever the group's Get yielded for that key before the add;
// nil writers are skipped and yield nil.
func (c *ContainerIO) AddOutput(name string, stdout, stderr io.WriteCloser) (io.WriteCloser, io.WriteCloser) {
        // swap looks up the current entry for the key, installs w, and hands back the looked-up entry.
        swap := func(group *cioutil.WriterGroup, kind StreamType, w io.WriteCloser) io.WriteCloser {
                if w == nil {
                        return nil
                }
                key := streamKey(c.id, name, kind)
                prev := group.Get(key)
                group.Add(key, w)
                return prev
        }
        prevStdout := swap(c.stdoutGroup, Stdout, stdout)
        prevStderr := swap(c.stderrGroup, Stderr, stderr)
        return prevStdout, prevStderr
}

// Cancel cancels container io by delegating to the closer's Cancel.
func (c *ContainerIO) Cancel() {
        closer := c.closer
        closer.Cancel()
}

// Wait blocks until container io finishes, by delegating to the closer's Wait.
func (c *ContainerIO) Wait() {
        closer := c.closer
        closer.Wait()
}

// Close closes the opened stdio streams and then the fifo set.
// It returns the error from closing the fifo set, or nil when there is none.
func (c *ContainerIO) Close() error {
        // Guard the closer like ExecIO.Close does, so closing a ContainerIO whose
        // streams were never opened does not panic.
        if c.closer != nil {
                c.closer.Close()
        }
        if c.fifos != nil {
                return c.fifos.Close()
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package io

import (
        "io"
        "sync"

        "github.com/containerd/log"

        "github.com/containerd/containerd/v2/pkg/cio"
        cioutil "github.com/containerd/containerd/v2/pkg/ioutil"
)

// ExecIO holds the exec io.
type ExecIO struct {
        // id is the exec ID; it is used in log messages.
        id string
        // fifos describes the stdio endpoints the streams were opened from.
        fifos *cio.FIFOSet
        // stdioStream embeds the opened stdin/stdout/stderr streams.
        *stdioStream
        // closer is the wgCloser returned by newStdioStream for the opened streams.
        closer *wgCloser
}

// Compile-time assertion that *ExecIO satisfies cio.IO.
var _ cio.IO = &ExecIO{}

// NewFifoExecIO creates exec io by named pipes.
//
// It creates a fifo set for id under root (see newFifos) and opens the stdio
// streams from it. If opening the streams fails, the fifo set is closed before
// the error is returned.
func NewFifoExecIO(id, root string, tty, stdin bool) (*ExecIO, error) {
        fifos, err := newFifos(root, id, tty, stdin)
        if err != nil {
                return nil, err
        }
        stdio, closer, err := newStdioStream(fifos)
        if err != nil {
                // The caller only gets nil back, so this is the last chance to release
                // whatever newFifos set up. The open error stays the returned error; a
                // cleanup failure is only logged.
                if cerr := fifos.Close(); cerr != nil {
                        log.L.WithError(cerr).Errorf("Failed to clean up fifos for container exec %q", id)
                }
                return nil, err
        }
        return &ExecIO{
                id:          id,
                fifos:       fifos,
                stdioStream: stdio,
                closer:      closer,
        }, nil
}

// NewStreamExecIO creates exec io backed by streams instead of named pipes.
// Each endpoint is built by newStreams as `<address>?streaming_id=<id>`, where
// the streaming IDs are ExecID-stdin, ExecID-stdout and ExecID-stderr.
// For example, with address `ttrpc+unix:///run/demo.sock`, exec ID `app`,
// stdin enabled and TTY disabled, the endpoints are:
//
//   - Stdin: ttrpc+unix:///run/demo.sock?streaming_id=app-stdin
//   - Stdout: ttrpc+unix:///run/demo.sock?streaming_id=app-stdout
//   - Stderr: ttrpc+unix:///run/demo.sock?streaming_id=app-stderr
//
// The streaming IDs will be used as unique key to establish stream tunnel.
// And it should support reconnection with the same streaming ID if containerd restarts.
func NewStreamExecIO(id, address string, tty, stdin bool) (*ExecIO, error) {
        endpoints, err := newStreams(address, id, tty, stdin)
        if err != nil {
                return nil, err
        }
        streams, closer, err := newStdioStream(endpoints)
        if err != nil {
                return nil, err
        }
        execIO := &ExecIO{
                id:          id,
                fifos:       endpoints,
                stdioStream: streams,
                closer:      closer,
        }
        return execIO, nil
}

// Config returns the cio.Config carried by the exec's fifo set.
func (e *ExecIO) Config() cio.Config {
        set := e.fifos
        return set.Config
}

// Attach attaches exec stdio. The logic is similar with container io attach.
//
// Unlike ContainerIO.Attach it does not block: it returns a channel that is
// closed once every goroutine started for this attach has finished. Stderr is
// not attached when opts.Tty is set.
func (e *ExecIO) Attach(opts AttachOptions) <-chan struct{} {
        var wg sync.WaitGroup
        var stdinStreamRC io.ReadCloser
        if e.stdin != nil && opts.Stdin != nil {
                // Wrap the client stdin so the copy below can be stopped by closing
                // the wrapper (done from attachOutput once an output stream ends).
                stdinStreamRC = cioutil.NewWrapReadCloser(opts.Stdin)
                wg.Add(1)
                go func() {
                        if _, err := io.Copy(e.stdin, stdinStreamRC); err != nil {
                                log.L.WithError(err).Errorf("Failed to redirect stdin for container exec %q", e.id)
                        }
                        log.L.Infof("Container exec %q stdin closed", e.id)
                        if opts.StdinOnce && !opts.Tty {
                                // Stdin is done for good: close the exec's stdin and ask the
                                // caller-provided hook to close it as well.
                                // NOTE(review): opts.CloseStdin is called without a nil check.
                                e.stdin.Close()
                                if err := opts.CloseStdin(); err != nil {
                                        log.L.WithError(err).Errorf("Failed to close stdin for container exec %q", e.id)
                                }
                        } else {
                                // Otherwise close the exec's output streams, which ends the
                                // copies running in attachOutput.
                                if e.stdout != nil {
                                        e.stdout.Close()
                                }
                                if e.stderr != nil {
                                        e.stderr.Close()
                                }
                        }
                        wg.Done()
                }()
        }

        // attachOutput copies one exec output stream (out) to the client stream,
        // then closes both ends, stops the stdin copy, and signals both the
        // closer's wait group and this attach's wait group.
        attachOutput := func(t StreamType, stream io.WriteCloser, out io.ReadCloser) {
                if _, err := io.Copy(stream, out); err != nil {
                        log.L.WithError(err).Errorf("Failed to pipe %q for container exec %q", t, e.id)
                }
                out.Close()
                stream.Close()
                if stdinStreamRC != nil {
                        stdinStreamRC.Close()
                }
                e.closer.wg.Done()
                wg.Done()
                log.L.Debugf("Finish piping %q of container exec %q", t, e.id)
        }

        if opts.Stdout != nil {
                wg.Add(1)
                // Closer should wait for this routine to be over.
                e.closer.wg.Add(1)
                go attachOutput(Stdout, opts.Stdout, e.stdout)
        }

        if !opts.Tty && opts.Stderr != nil {
                wg.Add(1)
                // Closer should wait for this routine to be over.
                e.closer.wg.Add(1)
                go attachOutput(Stderr, opts.Stderr, e.stderr)
        }

        // Close done once all goroutines above have called wg.Done.
        done := make(chan struct{})
        go func() {
                wg.Wait()
                close(done)
        }()
        return done
}

// Cancel cancels exec io by delegating to the closer's Cancel.
func (e *ExecIO) Cancel() {
        closer := e.closer
        closer.Cancel()
}

// Wait blocks until exec io finishes, by delegating to the closer's Wait.
func (e *ExecIO) Wait() {
        closer := e.closer
        closer.Wait()
}

// Close closes the opened stdio streams (when a closer is set) and then the
// fifo set. It returns the error from closing the fifo set, or nil when there
// is none.
func (e *ExecIO) Close() error {
        if closer := e.closer; closer != nil {
                closer.Close()
        }
        if e.fifos == nil {
                return nil
        }
        return e.fifos.Close()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package io

import (
        "context"
        "fmt"
        "io"
        "net/url"
        "os"
        "path/filepath"
        "strings"
        "sync"
        "syscall"
        "time"

        "github.com/containerd/ttrpc"
        "google.golang.org/grpc"
        "google.golang.org/grpc/credentials/insecure"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        streamingapi "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/containerd/v2/core/streaming/proxy"
        "github.com/containerd/containerd/v2/core/transfer/streaming"
        "github.com/containerd/containerd/v2/pkg/cio"
        "github.com/containerd/containerd/v2/pkg/shim"
)

// AttachOptions specifies how to attach to a container.
type AttachOptions struct {
        // Stdin is the client's input; it is copied into the container/exec stdin
        // when both are present.
        Stdin io.Reader
        // Stdout receives the attached stdout; nil means stdout is not attached.
        Stdout io.WriteCloser
        // Stderr receives the attached stderr; it is ignored when Tty is set.
        Stderr io.WriteCloser
        // Tty indicates a terminal session; stderr is not attached separately then.
        Tty bool
        // StdinOnce, together with !Tty, makes Attach close the container's stdin
        // (and call CloseStdin) once the client's stdin ends.
        StdinOnce bool
        // CloseStdin is the function to close container stdin.
        CloseStdin func() error
}

// StreamType is the type of the stream: stdin, stdout or stderr.
type StreamType string

const (
        // Stdin stream type.
        Stdin StreamType = "stdin"
        // Stdout stream type; its value is taken from the CRI runtime.Stdout constant.
        Stdout = StreamType(runtime.Stdout)
        // Stderr stream type; its value is taken from the CRI runtime.Stderr constant.
        Stderr = StreamType(runtime.Stderr)
)

// wgCloser bundles what is needed to tear down a set of opened stdio streams:
// the closers themselves, a wait group for goroutines using them, and the
// cancel function of the context the streams were opened with.
type wgCloser struct {
        // ctx is the context the streams were opened with.
        ctx context.Context
        // wg tracks goroutines that are still using the streams.
        wg *sync.WaitGroup
        // set holds every stream that was opened successfully.
        set []io.Closer
        // cancel cancels ctx.
        cancel context.CancelFunc
}

// Wait blocks until the wait group is drained.
func (g *wgCloser) Wait() {
        group := g.wg
        group.Wait()
}

// Close closes every stream in the set, in order. Errors from the individual
// Close calls are ignored.
func (g *wgCloser) Close() {
        for i := range g.set {
                g.set[i].Close()
        }
}

// Cancel invokes the stored cancel function.
func (g *wgCloser) Cancel() {
        stop := g.cancel
        stop()
}

// newFifos makes sure the "io" directory exists under root (mode 0700) and
// asks cio.NewFIFOSetInDir for a fifo set for id inside it. When stdin is
// false the set's Stdin path is blanked.
func newFifos(root, id string, tty, stdin bool) (*cio.FIFOSet, error) {
        ioDir := filepath.Join(root, "io")
        if mkErr := os.MkdirAll(ioDir, 0700); mkErr != nil {
                return nil, mkErr
        }
        set, err := cio.NewFIFOSetInDir(ioDir, id, tty)
        if err != nil {
                return nil, err
        }
        if stdin {
                return set, nil
        }
        set.Stdin = ""
        return set, nil
}

// newStreams builds a fifo set whose entries are stream URLs rather than fifo
// paths. Each URL has the form `<address>?streaming_id=<id>-<stdin|stdout|stderr>`.
// Stdin is only set when stdin is true, and Stderr only when tty is false.
// The set's close function is a no-op, and the returned error is always nil.
func newStreams(address, id string, tty, stdin bool) (*cio.FIFOSet, error) {
        streamURL := func(suffix string) string {
                return fmt.Sprintf("%s?streaming_id=%s", address, id+suffix)
        }
        noop := func() error { return nil }

        set := cio.NewFIFOSet(cio.Config{}, noop)
        set.Terminal = tty
        set.Stdout = streamURL("-stdout")
        if stdin {
                set.Stdin = streamURL("-stdin")
        }
        if !tty {
                set.Stderr = streamURL("-stderr")
        }
        return set, nil
}

// stdioStream holds the opened stdio endpoints. A field is left nil when the
// corresponding entry in the fifo set is empty (see newStdioStream).
type stdioStream struct {
        // stdin is the write side feeding the process's stdin.
        stdin io.WriteCloser
        // stdout is the read side of the process's stdout.
        stdout io.ReadCloser
        // stderr is the read side of the process's stderr.
        stderr io.ReadCloser
}

// newStdioStream opens the stdin, stdout and stderr endpoints named in fifos
// (skipping empty ones) and returns them together with a wgCloser that owns
// the opened streams and the cancel function of their context.
// If any open fails, the streams opened so far are closed and the context is
// cancelled before the error is returned.
func newStdioStream(fifos *cio.FIFOSet) (_ *stdioStream, _ *wgCloser, err error) {
        ctx, cancel := context.WithCancel(context.Background())
        var opened []io.Closer
        streams := &stdioStream{}

        defer func() {
                // err is the named result; the explicit returns below assign it
                // before this runs.
                if err == nil {
                        return
                }
                for i := range opened {
                        opened[i].Close()
                }
                cancel()
        }()

        if addr := fifos.Stdin; addr != "" {
                w, openErr := openStdin(ctx, addr)
                if openErr != nil {
                        return nil, nil, fmt.Errorf("failed to open stdin, %w", openErr)
                }
                streams.stdin = w
                opened = append(opened, w)
        }

        if addr := fifos.Stdout; addr != "" {
                r, openErr := openOutput(ctx, addr)
                if openErr != nil {
                        return nil, nil, fmt.Errorf("failed to open stdout, %w", openErr)
                }
                streams.stdout = r
                opened = append(opened, r)
        }

        if addr := fifos.Stderr; addr != "" {
                r, openErr := openOutput(ctx, addr)
                if openErr != nil {
                        return nil, nil, fmt.Errorf("failed to open stderr, %w", openErr)
                }
                streams.stderr = r
                opened = append(opened, r)
        }

        closer := &wgCloser{
                ctx:    ctx,
                cancel: cancel,
                wg:     &sync.WaitGroup{},
                set:    opened,
        }
        return streams, closer, nil
}

func openStdin(ctx context.Context, url string) (io.WriteCloser, error) {
        ok := strings.Contains(url, "://")
        if !ok {
                return openPipe(ctx, url, syscall.O_WRONLY|syscall.O_CREAT|syscall.O_NONBLOCK, 0700)
        }

        return openStdinStream(ctx, url)
}

// openStdinStream opens the stream addressed by url and wraps it as a
// byte-oriented writer.
func openStdinStream(ctx context.Context, url string) (io.WriteCloser, error) {
        stream, err := openStream(ctx, url)
        if err != nil {
                return nil, err
        }
        return streaming.WriteByteStream(ctx, stream), nil
}

func openOutput(ctx context.Context, url string) (io.ReadCloser, error) {
        ok := strings.Contains(url, "://")
        if !ok {
                return openPipe(ctx, url, syscall.O_RDONLY|syscall.O_CREAT|syscall.O_NONBLOCK, 0700)
        }

        return openOutputStream(ctx, url)
}

// openOutputStream opens the stream addressed by url and wraps it as a
// byte-oriented reader.
func openOutputStream(ctx context.Context, url string) (io.ReadCloser, error) {
        stream, err := openStream(ctx, url)
        if err != nil {
                return nil, err
        }
        return streaming.ReadByteStream(ctx, stream), nil
}

// openStream connects to the streaming endpoint described by urlStr and
// creates the stream identified by its "streaming_id" query parameter.
//
// urlStr should be in the form of:
// <ttrpc|grpc>+<unix|vsock|hvsock>://<uds-path|vsock-cid:vsock-port|uds-path:hvsock-port>?streaming_id=<stream-id>
//
// The returned stream is created with ctx, so ctx must stay alive for as
// long as the stream is in use. Connections opened here are closed again on
// every error path.
func openStream(ctx context.Context, urlStr string) (streamingapi.Stream, error) {
        u, err := url.Parse(urlStr)
        if err != nil {
                return nil, fmt.Errorf("address url parse error: %w", err)
        }
        // The address returned from sandbox controller should be in the form like ttrpc+unix://<uds-path>
        // or grpc+vsock://<cid>:<port>, we should get the protocol from the url first.
        protocol, scheme, ok := strings.Cut(u.Scheme, "+")
        if !ok {
                return nil, fmt.Errorf("the scheme of sandbox address should be in " +
                        "the form of <protocol>+<unix|vsock|tcp>, i.e. ttrpc+unix or grpc+vsock")
        }

        id := u.Query().Get("streaming_id")
        if id == "" {
                return nil, fmt.Errorf("no stream id in url queries")
        }
        realAddress := fmt.Sprintf("%s://%s/%s", scheme, u.Host, u.Path)
        conn, err := shim.AnonReconnectDialer(realAddress, 100*time.Second)
        if err != nil {
                return nil, fmt.Errorf("failed to connect the stream %w", err)
        }

        switch protocol {
        case "ttrpc":
                c := ttrpc.NewClient(conn)
                stream, err := proxy.NewStreamCreator(c).Create(ctx, id)
                if err != nil {
                        // Do not leak the client (and the connection it owns).
                        c.Close()
                        return nil, err
                }
                return stream, nil

        case "grpc":
                // The raw connection is not used by gRPC, which dials on its
                // own below; release it instead of leaking it.
                conn.Close()

                // The timeout only bounds the blocking dial. The stream itself
                // must be created with the caller's ctx: creating it with
                // dialCtx would abort it as soon as the deferred cancel runs.
                dialCtx, cancel := context.WithTimeout(ctx, time.Second*100)
                defer cancel()

                gopts := []grpc.DialOption{
                        grpc.WithTransportCredentials(insecure.NewCredentials()),
                        grpc.WithBlock(),
                }
                grpcConn, err := grpc.DialContext(dialCtx, realAddress, gopts...)
                if err != nil {
                        return nil, err
                }
                stream, err := proxy.NewStreamCreator(grpcConn).Create(ctx, id)
                if err != nil {
                        grpcConn.Close()
                        return nil, err
                }
                return stream, nil

        default:
                conn.Close()
                return nil, fmt.Errorf("protocol not supported")
        }
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package io

import (
        "context"
        "io"
        "os"

        "github.com/containerd/fifo"
)

// openPipe opens the fifo at path fn with the given flags and permissions
// by delegating to fifo.OpenFifo. This variant is built on non-Windows
// platforms only (see the build constraint at the top of the file).
func openPipe(ctx context.Context, fn string, flag int, perm os.FileMode) (io.ReadWriteCloser, error) {
        return fifo.OpenFifo(ctx, fn, flag, perm)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package io

import (
        "bufio"
        "bytes"
        "fmt"
        "io"
        "time"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        cioutil "github.com/containerd/containerd/v2/pkg/ioutil"
        "github.com/containerd/log"
)

const (
        // delimiter used in CRI logging format.
        delimiter = ' '
        // eol is the end-of-line character terminating each log entry.
        eol = '\n'
        // timestampFormat is the timestamp format used in CRI logging format.
        timestampFormat = time.RFC3339Nano
        // defaultBufSize is the default size of the read buffer in bytes.
        defaultBufSize = 4096
)

// NewDiscardLogger creates a logger which discards all the input: it wraps
// io.Discard in a no-op WriteCloser.
func NewDiscardLogger() io.WriteCloser {
        return cioutil.NewNopWriteCloser(io.Discard)
}

// NewCRILogger returns a write closer whose input is decorated into the CRI
// log format and forwarded to w by a background goroutine. The second
// return value is a channel that is closed once that goroutine has finished
// redirecting.
//
// path names the log file and is only used in log messages here. stream is
// the stream tag written on every entry. maxLen is the maximum length of a
// line; a longer line is cut into multiple entries.
func NewCRILogger(path string, w io.Writer, stream StreamType, maxLen int) (io.WriteCloser, <-chan struct{}) {
        log.L.Debugf("Start writing stream %q to log file %q", stream, path)

        done := make(chan struct{})
        pipeReader, pipeWriter := io.Pipe()
        go func() {
                redirectLogs(path, pipeReader, w, stream, maxLen)
                close(done)
        }()
        return pipeWriter, done
}

// readLine is a variant of bufio.Reader.ReadLine that does not hide read
// errors.
//
// bufio.ReadLine eats both read errors and trailing newlines
// (see https://golang.org/pkg/bufio/#Reader.ReadLine), so when reading up to
// io.EOF the caller cannot tell whether the data ended with a newline:
//  1. reading "CONTENT\n" returns "CONTENT" without error;
//  2. reading "CONTENT" also returns "CONTENT" without error.
//
// This function follows the bufio implementation
// (https://golang.org/src/bufio/bufio.go?s=9537:9604#L359) but passes every
// error from ReadSlice through, except bufio.ErrBufferFull, which is
// reported as isPrefix == true with a nil error.
//
// For a complete (non-prefix) read, err != nil if and only if the line does
// not end with a newline. The returned line has its "\n" or "\r\n"
// terminator removed.
func readLine(b *bufio.Reader) (line []byte, isPrefix bool, err error) {
        line, err = b.ReadSlice('\n')

        if err == bufio.ErrBufferFull {
                // The buffer filled up before a newline was seen. If it ends in
                // '\r', that may be the first half of a "\r\n" straddling the
                // buffer boundary: push it back so the next read sees it.
                if n := len(line); n > 0 && line[n-1] == '\r' {
                        if unreadErr := b.UnreadByte(); unreadErr != nil {
                                panic(fmt.Sprintf("invalid unread %v", unreadErr))
                        }
                        line = line[:n-1]
                }
                return line, true, nil
        }

        n := len(line)
        if n == 0 {
                if err != nil {
                        line = nil
                }
                return line, false, err
        }

        if line[n-1] != '\n' {
                // Unterminated data; err carries the reason.
                return line, false, err
        }

        // "ReadSlice returns err != nil if and only if line does not end in delim"
        // (See https://golang.org/pkg/bufio/#Reader.ReadSlice).
        if err != nil {
                panic(fmt.Sprintf("full read with unexpected error %v", err))
        }
        if n > 1 && line[n-2] == '\r' {
                return line[:n-2], false, nil
        }
        return line[:n-1], false, nil
}

// redirectLogs reads lines from rc until it hits an error (including
// io.EOF) and writes each of them to w as a CRI log entry of the form
// "<timestamp> <stream> <tag> <content>\n", where tag is
// runtime.LogTagFull for a complete line and runtime.LogTagPartial for a
// fragment. rc is closed when the function returns.
//
// path is only used in log messages. s is the stream name written in every
// entry. When maxLen > 0, content longer than maxLen is split into several
// partial entries. Write errors on w are logged and otherwise ignored so
// that the input keeps being drained.
func redirectLogs(path string, rc io.ReadCloser, w io.Writer, s StreamType, maxLen int) {
        defer rc.Close()
        var (
                stream    = []byte(s)
                delimiter = []byte{delimiter}
                partial   = []byte(runtime.LogTagPartial)
                full      = []byte(runtime.LogTagFull)
                // buf holds the fragments of the line currently being
                // assembled; length is their total size in bytes.
                buf       [][]byte
                length    int
                bufSize   = defaultBufSize

                // timeBuffer and lineBuffer are reused across entries.
                timeBuffer = make([]byte, len(timestampFormat))
                lineBuffer = bytes.Buffer{}
        )
        // Make sure bufSize <= maxLen
        if maxLen > 0 && maxLen < bufSize {
                bufSize = maxLen
        }
        r := bufio.NewReaderSize(rc, bufSize)
        // writeLineBuffer formats one entry (timestamp, stream and tag, each
        // followed by the delimiter, then the content and an end-of-line)
        // and writes it to w, updating the output metrics on success.
        writeLineBuffer := func(tag []byte, lineBytes [][]byte) {
                timeBuffer = time.Now().AppendFormat(timeBuffer[:0], timestampFormat)
                headers := [][]byte{timeBuffer, stream, tag}

                lineBuffer.Reset()
                for _, h := range headers {
                        lineBuffer.Write(h)
                        lineBuffer.Write(delimiter)
                }
                for _, l := range lineBytes {
                        lineBuffer.Write(l)
                }
                lineBuffer.WriteByte(eol)
                if n, err := lineBuffer.WriteTo(w); err == nil {
                        outputEntries.Inc()
                        outputBytes.Inc(float64(n))
                } else {
                        log.L.WithError(err).Errorf("Fail to write %q log to log file %q", s, path)
                        // Continue on write error to drain the container output.
                }
        }
        for {
                var stop bool
                newLine, isPrefix, err := readLine(r)
                // NOTE(random-liu): readLine can return actual content even if there is an error.
                if len(newLine) > 0 {
                        inputEntries.Inc()
                        inputBytes.Inc(float64(len(newLine)))
                        // Buffer returned by ReadLine will change after
                        // next read, copy it.
                        l := make([]byte, len(newLine))
                        copy(l, newLine)
                        buf = append(buf, l)
                        length += len(l)
                }
                if err != nil {
                        if err == io.EOF {
                                log.L.Tracef("Getting EOF from stream %q while redirecting to log file %q", s, path)
                        } else {
                                log.L.WithError(err).Errorf("An error occurred when redirecting stream %q to log file %q", s, path)
                        }
                        if length == 0 {
                                // No content left to write, break.
                                break
                        }
                        // Stop after writing the content left in buffer.
                        stop = true
                }
                // The assembled content exceeds maxLen: emit the first maxLen
                // bytes as a partial entry and carry the remainder over.
                if maxLen > 0 && length > maxLen {
                        exceedLen := length - maxLen
                        last := buf[len(buf)-1]
                        if exceedLen > len(last) {
                                // exceedLen must <= len(last), or else the buffer
                                // should have been written in the previous iteration.
                                panic("exceed length should <= last buffer size")
                        }
                        buf[len(buf)-1] = last[:len(last)-exceedLen]
                        writeLineBuffer(partial, buf)
                        splitEntries.Inc()
                        buf = [][]byte{last[len(last)-exceedLen:]}
                        length = exceedLen
                }
                // The line is not complete yet; keep accumulating fragments.
                if isPrefix {
                        continue
                }
                if stop {
                        // readLine only returns error when the message doesn't
                        // end with a newline, in that case it should be treated
                        // as a partial line.
                        writeLineBuffer(partial, buf)
                } else {
                        writeLineBuffer(full, buf)
                }
                buf = nil
                length = 0
                if stop {
                        break
                }
        }
        log.L.Debugf("Finish redirecting stream %q to log file %q", s, path)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package io

import "github.com/docker/go-metrics"

// Counters for the CRI logging volume. They are created and registered in
// init.
var (
        // inputEntries counts log entries received.
        inputEntries  metrics.Counter
        // outputEntries counts log entries successfully written.
        outputEntries metrics.Counter
        // inputBytes counts the size of logs received.
        inputBytes    metrics.Counter
        // outputBytes counts the size of logs successfully written.
        outputBytes   metrics.Counter
        // splitEntries counts the extra entries created by splitting
        // over-long log entries.
        splitEntries  metrics.Counter
)

// init creates the logging counters in the "containerd"/"cri" metrics
// namespace and registers that namespace.
func init() {
        // These CRI metrics record input and output logging volume.
        ns := metrics.NewNamespace("containerd", "cri", nil)

        inputEntries = ns.NewCounter("input_entries", "Number of log entries received")
        outputEntries = ns.NewCounter("output_entries", "Number of log entries successfully written to disk")
        inputBytes = ns.NewCounter("input_bytes", "Size of logs received")
        outputBytes = ns.NewCounter("output_bytes", "Size of logs successfully written to disk")
        splitEntries = ns.NewCounter("split_entries", "Number of extra log entries created by splitting the "+
                "original log entry. This happens when the original log entry exceeds length limit. "+
                "This metric does not count the original log entry.")

        metrics.Register(ns)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package nri

import (
        "context"
        "encoding/json"
        "fmt"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/internal/cri/annotations"
        "github.com/containerd/containerd/v2/internal/cri/constants"
        cstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        sstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/containerd/v2/pkg/blockio"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        "github.com/opencontainers/runtime-tools/generate"
        cri "k8s.io/cri-api/pkg/apis/runtime/v1"

        "github.com/containerd/containerd/v2/internal/nri"
        "github.com/containerd/nri/pkg/api"
        nrigen "github.com/containerd/nri/pkg/runtime-tools/generate"
)

// API ties the CRI implementation to the common NRI interface. It forwards
// CRI lifecycle events to NRI and serves as the NRI 'domain' for CRI pods
// and containers.
type API struct {
        // cri is the CRI side; it is set by Register.
        cri CRIImplementation
        // nri is the common NRI interface; it is set by NewAPI.
        nri nri.API
}

// NewAPI returns an API backed by the given NRI interface. The CRI
// implementation is attached later through Register.
func NewAPI(nri nri.API) *API {
        a := new(API)
        a.nri = nri
        return a
}

// IsDisabled reports whether NRI is unavailable: the receiver is nil, it
// has no NRI interface, or that interface reports itself as not enabled.
func (a *API) IsDisabled() bool {
        if a == nil || a.nri == nil {
                return true
        }
        return !a.nri.IsEnabled()
}

// IsEnabled is the negation of IsDisabled.
func (a *API) IsEnabled() bool { return !a.IsDisabled() }

// Register attaches the CRI implementation, registers this API as an NRI
// domain and starts NRI. It does nothing and returns nil when NRI is
// disabled.
func (a *API) Register(cri CRIImplementation) error {
        if a.IsDisabled() {
                return nil
        }

        a.cri = cri
        nri.RegisterDomain(a)

        return a.nri.Start()
}

//
// CRI-NRI lifecycle hook interface
//
// These functions are used to hook NRI into the processing of
// the corresponding CRI lifecycle events using the common NRI
// interface.
//

// RunPodSandbox relays the pod creation event to NRI. If NRI rejects it,
// the pod is stopped and removed on the NRI side (errors from that rollback
// are ignored) and the original error is returned. No-op when NRI is
// disabled.
func (a *API) RunPodSandbox(ctx context.Context, criPod *sstore.Sandbox) error {
        if a.IsDisabled() {
                return nil
        }

        pod := a.nriPodSandbox(criPod)
        if err := a.nri.RunPodSandbox(ctx, pod); err != nil {
                a.nri.StopPodSandbox(ctx, pod)
                a.nri.RemovePodSandbox(ctx, pod)
                return err
        }
        return nil
}

// StopPodSandbox relays the pod stop event to NRI. No-op when NRI is
// disabled.
func (a *API) StopPodSandbox(ctx context.Context, criPod *sstore.Sandbox) error {
        if a.IsDisabled() {
                return nil
        }
        return a.nri.StopPodSandbox(ctx, a.nriPodSandbox(criPod))
}

// RemovePodSandbox relays the pod removal event to NRI. No-op when NRI is
// disabled.
func (a *API) RemovePodSandbox(ctx context.Context, criPod *sstore.Sandbox) error {
        if a.IsDisabled() {
                return nil
        }
        return a.nri.RemovePodSandbox(ctx, a.nriPodSandbox(criPod))
}

// CreateContainer relays the container creation event to NRI and returns
// the adjustment NRI wants applied to the container. The pod is looked up
// in the CRI sandbox store by the container's pod sandbox ID; a failed
// lookup is returned as the error.
//
// Like the other lifecycle hooks it is a no-op when NRI is disabled, in
// which case it returns a nil adjustment and a nil error instead of
// dereferencing the unset CRI implementation.
func (a *API) CreateContainer(ctx context.Context, ctrs *containers.Container, spec *runtimespec.Spec) (*api.ContainerAdjustment, error) {
        if a.IsDisabled() {
                return nil, nil
        }

        ctr := a.nriContainer(ctrs, spec)

        criPod, err := a.cri.SandboxStore().Get(ctr.GetPodSandboxID())
        if err != nil {
                return nil, err
        }

        return a.nri.CreateContainer(ctx, a.nriPodSandbox(&criPod), ctr)
}

// PostCreateContainer relays the post-create event for a container to NRI.
// No-op when NRI is disabled.
func (a *API) PostCreateContainer(ctx context.Context, criPod *sstore.Sandbox, criCtr *cstore.Container) error {
        if a.IsDisabled() {
                return nil
        }
        return a.nri.PostCreateContainer(ctx, a.nriPodSandbox(criPod), a.nriContainer(criCtr, nil))
}

// StartContainer relays the container start event to NRI. No-op when NRI is
// disabled.
func (a *API) StartContainer(ctx context.Context, criPod *sstore.Sandbox, criCtr *cstore.Container) error {
        if a.IsDisabled() {
                return nil
        }
        return a.nri.StartContainer(ctx, a.nriPodSandbox(criPod), a.nriContainer(criCtr, nil))
}

// PostStartContainer relays the post-start event for a container to NRI.
// No-op when NRI is disabled.
func (a *API) PostStartContainer(ctx context.Context, criPod *sstore.Sandbox, criCtr *cstore.Container) error {
        if a.IsDisabled() {
                return nil
        }
        return a.nri.PostStartContainer(ctx, a.nriPodSandbox(criPod), a.nriContainer(criCtr, nil))
}

// UpdateContainerResources relays a container resource update request to
// NRI and returns the resources NRI answered with, converted back to CRI
// form. When NRI is disabled it returns nil resources and a nil error.
func (a *API) UpdateContainerResources(ctx context.Context, criPod *sstore.Sandbox, criCtr *cstore.Container, req *cri.LinuxContainerResources) (*cri.LinuxContainerResources, error) {
        // noOomAdj is the OOM score adjustment passed to the CRI conversion.
        const noOomAdj = 0

        if a.IsDisabled() {
                return nil, nil
        }

        updated, err := a.nri.UpdateContainer(
                ctx,
                a.nriPodSandbox(criPod),
                a.nriContainer(criCtr, nil),
                api.FromCRILinuxResources(req),
        )
        if err != nil {
                return nil, err
        }
        return updated.ToCRI(noOomAdj), nil
}

// PostUpdateContainerResources relays the post-update event for a container
// to NRI. No-op when NRI is disabled.
func (a *API) PostUpdateContainerResources(ctx context.Context, criPod *sstore.Sandbox, criCtr *cstore.Container) error {
        if a.IsDisabled() {
                return nil
        }
        return a.nri.PostUpdateContainer(ctx, a.nriPodSandbox(criPod), a.nriContainer(criCtr, nil))
}

// StopContainer relays the container stop event to NRI. When no pod (or a
// pod without an ID) is given, a placeholder pod carrying only the
// container's pod sandbox ID is used instead. No-op when NRI is disabled.
func (a *API) StopContainer(ctx context.Context, criPod *sstore.Sandbox, criCtr *cstore.Container) error {
        if a.IsDisabled() {
                return nil
        }

        ctr := a.nriContainer(criCtr, nil)

        if havePod := criPod != nil && criPod.ID != ""; !havePod {
                criPod = &sstore.Sandbox{
                        Metadata: sstore.Metadata{ID: ctr.GetPodSandboxID()},
                }
        }

        return a.nri.StopContainer(ctx, a.nriPodSandbox(criPod), ctr)
}

// NotifyContainerExit tells NRI that a container has exited. The pod is
// looked up in the CRI sandbox store; the lookup error is ignored, and if
// the result has no ID a placeholder pod carrying only the container's pod
// sandbox ID is used. No-op when NRI is disabled.
func (a *API) NotifyContainerExit(ctx context.Context, criCtr *cstore.Container) {
        if a.IsDisabled() {
                return
        }

        ctr := a.nriContainer(criCtr, nil)

        criPod, _ := a.cri.SandboxStore().Get(ctr.GetPodSandboxID())
        if criPod.ID == "" {
                criPod = sstore.Sandbox{
                        Metadata: sstore.Metadata{ID: ctr.GetPodSandboxID()},
                }
        }

        a.nri.NotifyContainerExit(ctx, a.nriPodSandbox(&criPod), ctr)
}

// RemoveContainer relays the container removal event to NRI. No-op when NRI
// is disabled.
func (a *API) RemoveContainer(ctx context.Context, criPod *sstore.Sandbox, criCtr *cstore.Container) error {
        if a.IsDisabled() {
                return nil
        }
        return a.nri.RemoveContainer(ctx, a.nriPodSandbox(criPod), a.nriContainer(criCtr, nil))
}

// UndoCreateContainer rolls back a container creation on the NRI side by
// stopping and then removing the container identified by id. Failures of
// either step are logged, not returned. No-op when NRI is disabled.
func (a *API) UndoCreateContainer(ctx context.Context, criPod *sstore.Sandbox, id string, spec *runtimespec.Spec) {
        if a.IsDisabled() {
                return
        }

        pod := a.nriPodSandbox(criPod)
        ctr := a.nriContainer(&containers.Container{ID: id}, spec)

        if err := a.nri.StopContainer(ctx, pod, ctr); err != nil {
                log.G(ctx).WithError(err).Error("container creation undo (stop) failed")
        }
        if err := a.nri.RemoveContainer(ctx, pod, ctr); err != nil {
                log.G(ctx).WithError(err).Error("container creation undo (remove) failed")
        }
}

// WithContainerAdjustment returns a container option that asks NRI for an
// adjustment of the container being created and applies it to the
// container's OCI spec. When NRI is disabled the returned option does
// nothing.
//
// The option unmarshals the container spec, calls CreateContainer to get
// the adjustment, applies it through the NRI spec generator and stores the
// re-marshalled spec back on the container.
func (a *API) WithContainerAdjustment() containerd.NewContainerOpts {
        if a.IsDisabled() {
                noop := func(context.Context, *containerd.Client, *containers.Container) error {
                        return nil
                }
                return noop
        }

        // Drop hugepage limits when the CRI config disables the hugetlb
        // controller.
        checkResources := func(r *runtimespec.LinuxResources) error {
                if r != nil && a.cri.Config().DisableHugetlbController {
                        r.HugepageLimits = nil
                }
                return nil
        }

        // Map an RDT class name to an OCI RDT entry; no class means no entry.
        resolveRdt := func(className string) (*runtimespec.LinuxIntelRdt, error) {
                if className != "" {
                        return &runtimespec.LinuxIntelRdt{ClosID: className}, nil
                }
                return nil, nil
        }

        // Map a block I/O class name to OCI block I/O settings; no class
        // means no settings.
        resolveBlockIO := func(className string) (*runtimespec.LinuxBlockIO, error) {
                if className == "" {
                        return nil, nil
                }
                settings, err := blockio.ClassNameToLinuxOCI(className)
                if err != nil {
                        return nil, err
                }
                return settings, nil
        }

        resourceCheckOpt := nrigen.WithResourceChecker(checkResources)
        rdtResolveOpt := nrigen.WithRdtResolver(resolveRdt)
        blkioResolveOpt := nrigen.WithBlockIOResolver(resolveBlockIO)

        return func(ctx context.Context, _ *containerd.Client, c *containers.Container) error {
                spec := &runtimespec.Spec{}
                if err := json.Unmarshal(c.Spec.GetValue(), spec); err != nil {
                        return fmt.Errorf("failed to unmarshal container OCI Spec for NRI: %w", err)
                }

                adjust, err := a.CreateContainer(ctx, c, spec)
                if err != nil {
                        return fmt.Errorf("failed to get NRI adjustment for container: %w", err)
                }

                sgen := generate.Generator{Config: spec}
                ngen := nrigen.SpecGenerator(&sgen, resourceCheckOpt, rdtResolveOpt, blkioResolveOpt)
                if err = ngen.Adjust(adjust); err != nil {
                        return fmt.Errorf("failed to NRI-adjust container Spec: %w", err)
                }

                adjusted, err := typeurl.MarshalAny(spec)
                if err != nil {
                        return fmt.Errorf("failed to marshal NRI-adjusted Spec: %w", err)
                }
                c.Spec = adjusted

                return nil
        }
}

// WithContainerExit returns a process delete option that notifies NRI about
// the exit of criCtr. When NRI is disabled at the time this method is
// called, the returned option does nothing.
func (a *API) WithContainerExit(criCtr *cstore.Container) containerd.ProcessDeleteOpts {
        if a.IsDisabled() {
                return func(_ context.Context, _ containerd.Process) error {
                        return nil
                }
        }

        return func(_ context.Context, _ containerd.Process) error {
                // The notification uses a fresh background context rather than
                // the context handed to the option.
                a.NotifyContainerExit(context.Background(), criCtr)
                return nil
        }
}

//
// NRI-CRI 'domain' interface
//
// These functions are used to interface CRI pods and containers
// from the common NRI interface. They implement pod and container
// discovery, lookup and updating of container parameters.
//

const (
        // nriDomain is the NRI domain name reported for CRI pods and
        // containers; it is the CRI containerd namespace constant.
        nriDomain = constants.K8sContainerdNamespace
)

// GetName returns the name of this NRI domain.
func (a *API) GetName() string {
        return nriDomain
}

// ListPodSandboxes returns every pod in the CRI sandbox store whose state
// is not unknown, wrapped for NRI. The result is never nil.
func (a *API) ListPodSandboxes() []nri.PodSandbox {
        result := []nri.PodSandbox{}
        for _, pod := range a.cri.SandboxStore().List() {
                if pod.Status.Get().State == sstore.StateUnknown {
                        continue
                }
                // Copy so the wrapper does not point at the loop variable.
                pod := pod
                result = append(result, a.nriPodSandbox(&pod))
        }
        return result
}

// ListContainers returns every container in the CRI container store that
// is neither exited nor in unknown state, wrapped for NRI. The result is
// never nil.
func (a *API) ListContainers() []nri.Container {
        result := []nri.Container{}
        for _, ctr := range a.cri.ContainerStore().List() {
                state := ctr.Status.Get().State()
                if state == cri.ContainerState_CONTAINER_EXITED ||
                        state == cri.ContainerState_CONTAINER_UNKNOWN {
                        continue
                }
                // Copy so the wrapper does not point at the loop variable.
                ctr := ctr
                result = append(result, a.nriContainer(&ctr, nil))
        }
        return result
}

// GetPodSandbox looks up the pod with the given id in the CRI sandbox
// store. The boolean is false (and the pod nil) when the lookup fails.
func (a *API) GetPodSandbox(id string) (nri.PodSandbox, bool) {
        if pod, err := a.cri.SandboxStore().Get(id); err == nil {
                return a.nriPodSandbox(&pod), true
        }
        return nil, false
}

// GetContainer looks up the container with the given id in the CRI
// container store. The boolean is false (and the container nil) when the
// lookup fails.
func (a *API) GetContainer(id string) (nri.Container, bool) {
        if ctr, err := a.cri.ContainerStore().Get(id); err == nil {
                return a.nriContainer(&ctr, nil), true
        }
        return nil, false
}

// UpdateContainer applies an NRI-requested resource update to a container
// through the CRI implementation, under the container's status update lock.
// A container that cannot be found in the store is silently skipped. An
// update failure is returned unless the request has IgnoreFailure set.
func (a *API) UpdateContainer(ctx context.Context, u *api.ContainerUpdate) error {
        ctr, err := a.cri.ContainerStore().Get(u.ContainerId)
        if err != nil {
                return nil
        }

        apply := func(status cstore.Status) (cstore.Status, error) {
                criReq := &cri.UpdateContainerResourcesRequest{
                        ContainerId: u.ContainerId,
                        Linux:       u.GetLinux().GetResources().ToCRI(0),
                }
                return a.cri.UpdateContainerResources(ctx, ctr, criReq, status)
        }

        if err := ctr.Status.UpdateSync(apply); err != nil && !u.IgnoreFailure {
                return err
        }
        return nil
}

// EvictContainer stops the container named in the eviction request through
// the CRI implementation, with a timeout argument of 0. A container that
// cannot be found in the store is silently skipped.
func (a *API) EvictContainer(ctx context.Context, e *api.ContainerEviction) error {
        ctr, err := a.cri.ContainerStore().Get(e.ContainerId)
        if err != nil {
                return nil
        }
        return a.cri.StopContainer(ctx, ctr, 0)
}

//
// NRI integration wrapper for CRI Pods
//

// criPodSandbox wraps a CRI sandbox for use through the NRI pod interface.
type criPodSandbox struct {
        // Sandbox is the wrapped CRI sandbox; it may be nil.
        *sstore.Sandbox
        // spec is the OCI spec of the sandbox task, or an empty spec when it
        // could not be obtained.
        spec *runtimespec.Spec
        // pid is the pid of the sandbox task, or 0 when there is no task.
        pid  uint32
}

// nriPodSandbox wraps a CRI sandbox for NRI. The wrapper always carries a
// non-nil spec: it starts out empty and is replaced by the sandbox task's
// spec when that can be fetched. The task pid is filled in when the sandbox
// container has a task. Lookup failures are logged (a missing task is not)
// and leave the wrapper partially filled.
func (a *API) nriPodSandbox(pod *sstore.Sandbox) *criPodSandbox {
        wrapped := &criPodSandbox{
                Sandbox: pod,
                spec:    &runtimespec.Spec{},
        }
        if pod == nil || pod.Container == nil {
                return wrapped
        }

        ctx := ctrdutil.NamespacedContext()

        task, err := pod.Container.Task(ctx, nil)
        if err != nil {
                if errdefs.IsNotFound(err) {
                        return wrapped
                }
                log.L.WithError(err).Errorf("failed to get task for sandbox container %s",
                        pod.Container.ID())
                return wrapped
        }
        wrapped.pid = task.Pid()

        if taskSpec, err := task.Spec(ctx); err != nil {
                log.L.WithError(err).Errorf("failed to get spec for sandbox container %s",
                        pod.Container.ID())
        } else {
                wrapped.spec = taskSpec
        }
        return wrapped
}

// GetDomain returns the NRI domain name this pod belongs to.
func (p *criPodSandbox) GetDomain() string {
        return nriDomain
}

// GetID returns the sandbox ID, or "" when no sandbox is wrapped.
func (p *criPodSandbox) GetID() string {
        if p.Sandbox != nil {
                return p.ID
        }
        return ""
}

// GetName returns the pod name from the sandbox config metadata, or ""
// when no sandbox is wrapped.
func (p *criPodSandbox) GetName() string {
        if p.Sandbox != nil {
                return p.Config.GetMetadata().GetName()
        }
        return ""
}

// GetUID returns the pod UID from the sandbox config metadata, or "" when
// no sandbox is wrapped.
func (p *criPodSandbox) GetUID() string {
        if p.Sandbox != nil {
                return p.Config.GetMetadata().GetUid()
        }
        return ""
}

// GetNamespace returns the namespace from the sandbox config metadata, or ""
// when no sandbox is wrapped.
func (p *criPodSandbox) GetNamespace() string {
        if p.Sandbox != nil {
                metadata := p.Config.GetMetadata()
                return metadata.GetNamespace()
        }
        return ""
}

// GetAnnotations returns a fresh map holding the sandbox config annotations
// overlaid with the annotations of the sandbox OCI spec (spec values win on
// key collisions). It returns nil when no sandbox is wrapped.
func (p *criPodSandbox) GetAnnotations() map[string]string {
        if p.Sandbox == nil {
                return nil
        }

        merged := make(map[string]string)
        // Order matters: later sources overwrite earlier ones.
        sources := []map[string]string{
                p.Config.GetAnnotations(),
                p.spec.Annotations,
        }
        for _, src := range sources {
                for k, v := range src {
                        merged[k] = v
                }
        }

        return merged
}

// GetLabels returns a fresh map holding the sandbox config labels overlaid
// with the labels from the sandbox container's info (container labels win on
// key collisions). It returns nil when no sandbox is wrapped. If the sandbox
// has no container, or its info cannot be read (which is logged), only the
// config labels are returned.
func (p *criPodSandbox) GetLabels() map[string]string {
        if p.Sandbox == nil {
                return nil
        }

        merged := make(map[string]string)
        for k, v := range p.Config.GetLabels() {
                merged[k] = v
        }

        sandboxCtr := p.Sandbox.Container
        if sandboxCtr == nil {
                return merged
        }

        info, err := sandboxCtr.Info(ctrdutil.NamespacedContext(), containerd.WithoutRefreshedMetadata)
        if err != nil {
                log.L.WithError(err).Errorf("failed to get info for sandbox container %s", sandboxCtr.ID())
                return merged
        }
        for k, v := range info.Labels {
                merged[k] = v
        }

        return merged
}

// GetRuntimeHandler returns the runtime handler of the wrapped sandbox, or ""
// when no sandbox is wrapped.
func (p *criPodSandbox) GetRuntimeHandler() string {
        if p.Sandbox != nil {
                return p.RuntimeHandler
        }
        return ""
}

// GetLinuxPodSandbox returns the receiver itself as an nri.LinuxPodSandbox.
func (p *criPodSandbox) GetLinuxPodSandbox() nri.LinuxPodSandbox {
        return p
}

// GetLinuxNamespaces converts the Linux namespaces of the sandbox OCI spec to
// their NRI representation. It returns nil when the spec has no Linux section.
func (p *criPodSandbox) GetLinuxNamespaces() []*api.LinuxNamespace {
        linux := p.spec.Linux
        if linux == nil {
                return nil
        }
        return api.FromOCILinuxNamespaces(linux.Namespaces)
}

// GetPodLinuxOverhead converts the overhead resources from the sandbox's CRI
// Linux config to their NRI representation. It returns nil when no sandbox is
// wrapped.
func (p *criPodSandbox) GetPodLinuxOverhead() *api.LinuxResources {
        if p.Sandbox != nil {
                overhead := p.Config.GetLinux().GetOverhead()
                return api.FromCRILinuxResources(overhead)
        }
        return nil
}
// GetPodLinuxResources converts the resources from the sandbox's CRI Linux
// config to their NRI representation. It returns nil when no sandbox is
// wrapped.
func (p *criPodSandbox) GetPodLinuxResources() *api.LinuxResources {
        if p.Sandbox != nil {
                resources := p.Config.GetLinux().GetResources()
                return api.FromCRILinuxResources(resources)
        }
        return nil
}

// GetLinuxResources converts the Linux resources of the sandbox OCI spec to
// their NRI representation. It returns nil when the spec has no Linux section.
func (p *criPodSandbox) GetLinuxResources() *api.LinuxResources {
        if linux := p.spec.Linux; linux != nil {
                return api.FromOCILinuxResources(linux.Resources, nil)
        }
        return nil
}

// GetCgroupParent returns the cgroup parent from the sandbox's CRI Linux
// config, or "" when no sandbox is wrapped.
func (p *criPodSandbox) GetCgroupParent() string {
        if p.Sandbox != nil {
                linuxConfig := p.Config.GetLinux()
                return linuxConfig.GetCgroupParent()
        }
        return ""
}

// GetCgroupsPath returns the cgroups path from the sandbox OCI spec, or ""
// when the spec has no Linux section.
func (p *criPodSandbox) GetCgroupsPath() string {
        if linux := p.spec.Linux; linux != nil {
                return linux.CgroupsPath
        }
        return ""
}

// GetPid returns the pid recorded for the sandbox when the wrapper was built
// (0 if none was recorded).
func (p *criPodSandbox) GetPid() uint32 {
        return p.pid
}

//
// NRI integration wrapper for CRI Containers
//

// criContainer bundles the data about a single container that the accessor
// methods below expose to NRI.
type criContainer struct {
        // api is the owning API instance; used to reach the CRI container store.
        api  *API
        // ctrs is the containerd container record; nil when wrapping failed.
        ctrs *containers.Container
        // spec is the container's OCI spec; accessors dereference it without a nil check.
        spec *runtimespec.Spec
        // meta is the CRI container metadata.
        meta *cstore.Metadata
        // pid is the pid of the container's task, or 0 when none was found.
        pid  uint32
}

// nriContainer wraps ctr for NRI.
//
// For a *cstore.Container the container info, OCI spec and task pid are
// looked up from containerd (the spec argument is not used); lookup failures
// are logged and replaced by zero values or an empty spec. For a
// *containers.Container the CRI metadata is decoded from the container's
// extensions and the supplied spec is used as is. Any other type is logged
// as an error and yields a wrapper with empty metadata and spec.
func (a *API) nriContainer(ctr interface{}, spec *runtimespec.Spec) *criContainer {
        switch c := ctr.(type) {
        case *cstore.Container:
                var (
                        ctx    = ctrdutil.NamespacedContext()
                        client = c.Container
                        pid    uint32
                )

                info, err := client.Info(ctx, containerd.WithoutRefreshedMetadata)
                if err != nil {
                        log.L.WithError(err).Errorf("failed to get info for container %s", client.ID())
                }

                ociSpec, err := client.Spec(ctx)
                if err != nil {
                        log.L.WithError(err).Errorf("failed to get OCI Spec for container %s", client.ID())
                        ociSpec = &runtimespec.Spec{}
                }

                // A missing task just means there is no pid to report.
                if task, err := client.Task(ctx, nil); err == nil {
                        pid = task.Pid()
                } else if !errdefs.IsNotFound(err) {
                        log.L.WithError(err).Errorf("failed to get task for container %s", client.ID())
                }

                return &criContainer{
                        api:  a,
                        ctrs: &info,
                        meta: &c.Metadata,
                        spec: ociSpec,
                        pid:  pid,
                }

        case *containers.Container:
                decoded := &cstore.Metadata{}
                ext := c.Extensions[a.cri.ContainerMetadataExtensionKey()]
                if ext != nil {
                        if err := typeurl.UnmarshalTo(ext, decoded); err != nil {
                                log.L.WithError(err).Errorf("failed to get metadata for container %s", c.ID)
                        }
                }

                return &criContainer{
                        api:  a,
                        ctrs: c,
                        meta: decoded,
                        spec: spec,
                }
        }

        log.L.Errorf("can't wrap %T as NRI container", ctr)
        return &criContainer{
                api:  a,
                meta: &cstore.Metadata{},
                spec: &runtimespec.Spec{},
        }
}

// GetDomain returns the package-level nriDomain value for this container.
func (c *criContainer) GetDomain() string {
        return nriDomain
}

// GetID returns the ID of the wrapped containerd container, or "" when the
// wrapper holds no container record.
func (c *criContainer) GetID() string {
        if c.ctrs == nil {
                return ""
        }
        return c.ctrs.ID
}

// GetPodSandboxID returns the value of the annotations.SandboxID key in the
// container's OCI spec annotations ("" when the key is absent).
func (c *criContainer) GetPodSandboxID() string {
        return c.spec.Annotations[annotations.SandboxID]
}

// GetName returns the value of the annotations.ContainerName key in the
// container's OCI spec annotations ("" when the key is absent).
func (c *criContainer) GetName() string {
        return c.spec.Annotations[annotations.ContainerName]
}

// GetState maps the container's current CRI state, as recorded in the CRI
// container store, to the corresponding NRI state. CRI "exited" maps to NRI
// "stopped". A container missing from the store, or in any other CRI state,
// is reported as unknown.
func (c *criContainer) GetState() api.ContainerState {
        stored, err := c.api.cri.ContainerStore().Get(c.GetID())
        if err != nil {
                return api.ContainerState_CONTAINER_UNKNOWN
        }

        state := api.ContainerState_CONTAINER_UNKNOWN
        switch stored.Status.Get().State() {
        case cri.ContainerState_CONTAINER_CREATED:
                state = api.ContainerState_CONTAINER_CREATED
        case cri.ContainerState_CONTAINER_RUNNING:
                state = api.ContainerState_CONTAINER_RUNNING
        case cri.ContainerState_CONTAINER_EXITED:
                state = api.ContainerState_CONTAINER_STOPPED
        }
        return state
}

// GetLabels returns a fresh map holding the containerd container labels
// overlaid with the labels from the CRI container config, when present
// (config values win on key collisions). It returns nil when the wrapper
// holds no container record.
func (c *criContainer) GetLabels() map[string]string {
        if c.ctrs == nil {
                return nil
        }

        merged := make(map[string]string)
        for k, v := range c.ctrs.Labels {
                merged[k] = v
        }

        if c.meta == nil || c.meta.Config == nil {
                return merged
        }
        for k, v := range c.meta.Config.Labels {
                merged[k] = v
        }

        return merged
}

// GetAnnotations returns a fresh map holding the OCI spec annotations
// overlaid with the annotations from the CRI container config, when present
// (config values win on key collisions).
func (c *criContainer) GetAnnotations() map[string]string {
        merged := make(map[string]string)
        for k, v := range c.spec.Annotations {
                merged[k] = v
        }

        if c.meta == nil || c.meta.Config == nil {
                return merged
        }
        for k, v := range c.meta.Config.Annotations {
                merged[k] = v
        }

        return merged
}

// GetArgs returns a duplicate of the process arguments from the OCI spec, or
// nil when the spec has no Process section.
func (c *criContainer) GetArgs() []string {
        if proc := c.spec.Process; proc != nil {
                return api.DupStringSlice(proc.Args)
        }
        return nil
}

// GetEnv returns a duplicate of the process environment from the OCI spec, or
// nil when the spec has no Process section.
func (c *criContainer) GetEnv() []string {
        if proc := c.spec.Process; proc != nil {
                return api.DupStringSlice(proc.Env)
        }
        return nil
}

// GetMounts converts the mounts of the container's OCI spec with
// api.FromOCIMounts and returns the result.
func (c *criContainer) GetMounts() []*api.Mount {
        return api.FromOCIMounts(c.spec.Mounts)
}

// GetHooks converts the hooks of the container's OCI spec with
// api.FromOCIHooks and returns the result.
func (c *criContainer) GetHooks() *api.Hooks {
        return api.FromOCIHooks(c.spec.Hooks)
}

// GetLinuxContainer returns the receiver itself as an nri.LinuxContainer.
func (c *criContainer) GetLinuxContainer() nri.LinuxContainer {
        return c
}

// GetLinuxNamespaces converts the Linux namespaces of the container's OCI
// spec to their NRI representation, or returns nil when the spec has no Linux
// section.
func (c *criContainer) GetLinuxNamespaces() []*api.LinuxNamespace {
        if linux := c.spec.Linux; linux != nil {
                return api.FromOCILinuxNamespaces(linux.Namespaces)
        }
        return nil
}

// GetLinuxDevices converts the Linux devices of the container's OCI spec to
// their NRI representation, or returns nil when the spec has no Linux section.
func (c *criContainer) GetLinuxDevices() []*api.LinuxDevice {
        if linux := c.spec.Linux; linux != nil {
                return api.FromOCILinuxDevices(linux.Devices)
        }
        return nil
}

// GetLinuxResources converts the Linux resources of the container's OCI spec
// (passing along the spec annotations) to their NRI representation, or
// returns nil when the spec has no Linux section.
func (c *criContainer) GetLinuxResources() *api.LinuxResources {
        if linux := c.spec.Linux; linux != nil {
                return api.FromOCILinuxResources(linux.Resources, c.spec.Annotations)
        }
        return nil
}

// GetOOMScoreAdj returns the OOMScoreAdj pointer from the process section of
// the OCI spec, or nil when the spec has no Process section.
func (c *criContainer) GetOOMScoreAdj() *int {
        if proc := c.spec.Process; proc != nil {
                return proc.OOMScoreAdj
        }
        return nil
}

// GetCgroupsPath returns the cgroups path from the container's OCI spec, or
// "" when the spec has no Linux section.
func (c *criContainer) GetCgroupsPath() string {
        if linux := c.spec.Linux; linux != nil {
                return linux.CgroupsPath
        }
        return ""
}

// GetPid returns the pid recorded for the container when the wrapper was
// built (0 if none was recorded).
func (c *criContainer) GetPid() uint32 {
        return c.pid
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package opts

import (
        "context"
        "errors"
        "fmt"
        "os"
        "strings"

        "github.com/containerd/continuity/fs"
        imagespec "github.com/opencontainers/image-spec/specs-go/v1"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
)

// WithNewSnapshot wraps `containerd.WithNewSnapshot`. If creating the snapshot
// fails with a not-found error, the image is unpacked into the container's
// snapshotter and snapshot creation is attempted once more. Any other error
// is returned unchanged.
func WithNewSnapshot(id string, i containerd.Image, opts ...snapshots.Opt) containerd.NewContainerOpts {
        create := containerd.WithNewSnapshot(id, i, opts...)
        return func(ctx context.Context, client *containerd.Client, c *containers.Container) error {
                firstErr := create(ctx, client, c)
                if firstErr == nil {
                        return nil
                }
                if !errdefs.IsNotFound(firstErr) {
                        return firstErr
                }

                // Not found: make sure the image is unpacked, then retry once.
                if unpackErr := i.Unpack(ctx, c.Snapshotter); unpackErr != nil {
                        return fmt.Errorf("error unpacking image: %w", unpackErr)
                }
                return create(ctx, client, c)
        }
}

// WithVolumes mounts the container's rootfs snapshot at a temporary directory
// and, for every volume whose path exists in that rootfs, copies the existing
// contents into the corresponding host path.
// It doesn't update runtime spec.
// The passed in map is a host path to container path map for all volumes.
func WithVolumes(volumeMounts map[string]string, platform imagespec.Platform) containerd.NewContainerOpts {
        return func(ctx context.Context, client *containerd.Client, c *containers.Container) (err error) {
                switch {
                case c.Snapshotter == "":
                        return errors.New("no snapshotter set for container")
                case c.SnapshotKey == "":
                        return errors.New("rootfs not created for container")
                }

                mounts, err := client.SnapshotService(c.Snapshotter).Mounts(ctx, c.SnapshotKey)
                if err != nil {
                        return err
                }
                // The rootfs is only read here, so mount a single overlay read-only;
                // this keeps the kernel from syncing the whole filesystem on umount.
                if len(mounts) == 1 && mounts[0].Type == "overlay" {
                        mounts[0].Options = append(mounts[0].Options, "ro")
                }

                root, err := os.MkdirTemp("", "ctd-volume")
                if err != nil {
                        return err
                }
                // Remove (not RemoveAll) is used on purpose: if the unmount failed we
                // would rather leak the temp dir than delete snapshot data.
                // refer to https://github.com/containerd/containerd/pull/1868
                // https://github.com/containerd/containerd/pull/1785
                defer os.Remove(root)

                if mountErr := mount.All(mounts, root); mountErr != nil {
                        return fmt.Errorf("failed to mount: %w", mountErr)
                }
                defer func() {
                        unmountErr := mount.Unmount(root, 0)
                        if unmountErr == nil {
                                return
                        }
                        log.G(ctx).WithError(unmountErr).Errorf("Failed to unmount snapshot %q", root)
                        // Surface the unmount failure only if nothing else went wrong.
                        if err == nil {
                                err = unmountErr
                        }
                }()

                onWindows := platform.OS == "windows"
                for hostPath, ctrPath := range volumeMounts {
                        // Windows allows volume mounts in subfolders under C: and as any other drive
                        // letter like D:, E:, etc. Images can only contain pre-existing files for
                        // volumes situated on the root filesystem, which is C:, so volumes on any
                        // other drive are skipped.
                        //
                        // C:\some\volume --> \some\volume
                        // D:\some\volume --> skip
                        if onWindows && len(ctrPath) >= 2 && ctrPath[1] == ':' {
                                // Case-insensitive comparison of the drive letter against "C".
                                if !strings.EqualFold(string(ctrPath[0]), "c") {
                                        continue
                                }
                                // Strip the drive letter so fs.RootPath() can append the remaining
                                // path to the rootfs path as seen by the host OS.
                                ctrPath = ctrPath[2:]
                        }

                        rootfsPath, rootErr := fs.RootPath(root, ctrPath)
                        if rootErr != nil {
                                return fmt.Errorf("rootpath on mountPath %s, volume %s: %w", root, ctrPath, rootErr)
                        }

                        if _, statErr := os.Stat(rootfsPath); statErr != nil {
                                if !os.IsNotExist(statErr) {
                                        return fmt.Errorf("stat volume in rootfs: %w", statErr)
                                }
                                // Nothing to copy when the directory is absent from the image.
                                continue
                        }

                        if copyErr := copyExistingContents(rootfsPath, hostPath); copyErr != nil {
                                return fmt.Errorf("taking runtime copy of volume: %w", copyErr)
                        }
                }
                return nil
        }
}

// copyExistingContents copies the directory tree at source into destination
// using fs.CopyDir, excluding the "security.selinux" xattr. The destination
// must already exist and be empty; otherwise an error is returned.
func copyExistingContents(source, destination string) error {
        dir, err := os.Open(destination)
        if err != nil {
                return err
        }
        defer dir.Close()

        entries, err := dir.Readdirnames(-1)
        switch {
        case err != nil:
                return err
        case len(entries) > 0:
                return fmt.Errorf("volume at %q is not initially empty", destination)
        }

        return fs.CopyDir(destination, source, fs.WithXAttrExclude("security.selinux"))
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package opts

import (
        "context"
        "fmt"
        "os"
        "path/filepath"
        "sort"

        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/pkg/oci"
        osinterface "github.com/containerd/containerd/v2/pkg/os"
)

// WithDarwinMounts adds mounts from CRI's container config + extra mounts.
//
// The returned SpecOpts:
//   - merges the CRI mounts with the extra mounts, dropping any extra mount
//     whose cleaned container path matches a CRI mount;
//   - stable-sorts the merged mounts with orderedMounts;
//   - keeps only those mounts already present in the spec whose destination
//     is not overridden by a merged mount;
//   - appends one "bind" mount per merged mount, creating the host path
//     (mode 0755) when it does not exist and resolving symlinks in it.
//
// It returns an error if a host path cannot be stat'ed, created or resolved.
func WithDarwinMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount) oci.SpecOpts {
        return func(ctx context.Context, client oci.Client, container *containers.Container, s *oci.Spec) error {
                // mergeMounts merge CRI mounts with extra mounts. If a mount destination
                // is mounted by both a CRI mount and an extra mount, the CRI mount will
                // be kept.
                var (
                        criMounts = config.GetMounts()
                        mounts    = append([]*runtime.Mount{}, criMounts...)
                )

                // Copy all mounts from extra mounts, except for mounts overridden by CRI.
                for _, e := range extra {
                        found := false
                        for _, c := range criMounts {
                                if cleanMount(e.ContainerPath) == cleanMount(c.ContainerPath) {
                                        found = true
                                        break
                                }
                        }
                        if !found {
                                mounts = append(mounts, e)
                        }
                }

                // Sort mounts in number of parts. This ensures that high level mounts don't
                // shadow other mounts.
                sort.Stable(orderedMounts(mounts))

                // Copy all mounts from default mounts, except for
                // - mounts overridden by supplied mount;
                mountSet := make(map[string]struct{})
                for _, m := range mounts {
                        mountSet[filepath.Clean(m.ContainerPath)] = struct{}{}
                }

                defaultMounts := s.Mounts
                s.Mounts = nil

                for _, m := range defaultMounts {
                        dst := cleanMount(m.Destination)
                        if _, ok := mountSet[dst]; ok {
                                // filter out mount overridden by a supplied mount
                                continue
                        }
                        s.Mounts = append(s.Mounts, m)
                }

                for _, mount := range mounts {
                        var (
                                dst = mount.GetContainerPath()
                                src = mount.GetHostPath()
                        )

                        // Create the host path if it doesn't exist.
                        if _, err := osi.Stat(src); err != nil {
                                if !os.IsNotExist(err) {
                                        return fmt.Errorf("failed to stat %q: %w", src, err)
                                }
                                if err := osi.MkdirAll(src, 0755); err != nil {
                                        return fmt.Errorf("failed to mkdir %q: %w", src, err)
                                }
                        }

                        // Keep the result in its own variable: on failure the returned path
                        // is not meaningful, and the error must name the path we tried to
                        // resolve rather than that result.
                        resolved, err := osi.ResolveSymbolicLink(src)
                        if err != nil {
                                return fmt.Errorf("failed to resolve symlink %q: %w", src, err)
                        }

                        var options []string
                        if mount.GetReadonly() {
                                options = append(options, "ro")
                        } else {
                                options = append(options, "rw")
                        }

                        s.Mounts = append(s.Mounts, runtimespec.Mount{
                                Source:      resolved,
                                Destination: dst,
                                Type:        "bind",
                                Options:     options,
                        })
                }
                return nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package opts

import (
        "context"
        "errors"
        "fmt"
        "os"
        "path/filepath"
        "strings"
        "sync"
        "syscall"

        "github.com/containerd/cgroups/v3"
        "golang.org/x/sys/unix"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
        "tags.cncf.io/container-device-interface/pkg/cdi"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/log"
)

// Linux dependent OCI spec opts.

// Cached result of the swap controller probe performed by
// SwapControllerAvailable, and the sync.Once that makes the probe run once.
var (
        swapControllerAvailability     bool
        swapControllerAvailabilityOnce sync.Once
)

// SwapControllerAvailable returns true if the swap controller is available.
//
// The check runs once and its result is cached. It stats
// memory.memsw.limit_in_bytes under /sys/fs/cgroup/memory, or, in cgroup
// unified mode, memory.swap.max in this process's own cgroup. Any failure
// other than the file not existing is logged as a warning, and the controller
// is then assumed to be unavailable.
func SwapControllerAvailable() bool {
        swapControllerAvailabilityOnce.Do(func() {
                const warn = "Failed to detect the availability of the swap controller, assuming not available"

                probe := "/sys/fs/cgroup/memory/memory.memsw.limit_in_bytes"
                if cgroups.Mode() == cgroups.Unified {
                        // memory.swap.max does not exist in the cgroup root, so we check /sys/fs/cgroup/<SELF>/memory.swap.max
                        _, unified, parseErr := cgroups.ParseCgroupFileUnified("/proc/self/cgroup")
                        if parseErr != nil {
                                wrapped := fmt.Errorf("failed to parse /proc/self/cgroup: %w", parseErr)
                                log.L.WithError(wrapped).Warn(warn)
                                return
                        }
                        probe = filepath.Join("/sys/fs/cgroup", unified, "memory.swap.max")
                }

                _, statErr := os.Stat(probe)
                switch {
                case statErr == nil:
                        swapControllerAvailability = true
                case !errors.Is(statErr, os.ErrNotExist):
                        log.L.WithError(statErr).Warn(warn)
                }
        })
        return swapControllerAvailability
}

// Cached result of isHugetlbControllerPresent and the sync.Once guarding its
// one-time computation.
var (
        supportsHugetlbOnce sync.Once
        supportsHugetlb     bool
)

// isHugetlbControllerPresent reports whether the hugetlb cgroup controller is
// present, using the cgroup v2 check in unified mode and the cgroup v1 check
// otherwise. The answer is computed once and cached.
func isHugetlbControllerPresent() bool {
        supportsHugetlbOnce.Do(func() {
                if IsCgroup2UnifiedMode() {
                        supportsHugetlb = cgroupv2HasHugetlb()
                        return
                }
                supportsHugetlb = cgroupv1HasHugetlb()
        })
        return supportsHugetlb
}

// Cached results (each paired with the sync.Once that guards it) for
// cgroupv1HasHugetlb, cgroupv2HasHugetlb and IsCgroup2UnifiedMode.
var (
        _cgroupv1HasHugetlbOnce sync.Once
        _cgroupv1HasHugetlb     bool
        _cgroupv2HasHugetlbOnce sync.Once
        _cgroupv2HasHugetlb     bool
        isUnifiedOnce           sync.Once
        isUnified               bool
)

// cgroupv1HasHugetlb returns whether the hugetlb controller is present on
// cgroup v1, i.e. whether /sys/fs/cgroup/hugetlb can be read as a directory.
// The result is computed once and cached.
func cgroupv1HasHugetlb() bool {
        _cgroupv1HasHugetlbOnce.Do(func() {
                _, err := os.ReadDir("/sys/fs/cgroup/hugetlb")
                _cgroupv1HasHugetlb = err == nil
        })
        return _cgroupv1HasHugetlb
}

// cgroupv2HasHugetlb returns whether the hugetlb controller is present on
// cgroup v2, i.e. whether /sys/fs/cgroup/cgroup.controllers contains
// "hugetlb". An unreadable file counts as "not present". The result is
// computed once and cached.
func cgroupv2HasHugetlb() bool {
        _cgroupv2HasHugetlbOnce.Do(func() {
                data, err := os.ReadFile("/sys/fs/cgroup/cgroup.controllers")
                if err == nil {
                        _cgroupv2HasHugetlb = strings.Contains(string(data), "hugetlb")
                }
        })
        return _cgroupv2HasHugetlb
}

// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified
// mode, determined by comparing the filesystem type of /sys/fs/cgroup with
// CGROUP2_SUPER_MAGIC. The result is computed once and cached. It panics if
// /sys/fs/cgroup cannot be statfs'ed.
func IsCgroup2UnifiedMode() bool {
        isUnifiedOnce.Do(func() {
                var fsInfo syscall.Statfs_t
                err := syscall.Statfs("/sys/fs/cgroup", &fsInfo)
                if err != nil {
                        panic("cannot statfs cgroup root")
                }
                isUnified = fsInfo.Type == unix.CGROUP2_SUPER_MAGIC
        })
        return isUnified
}

// WithCDI updates OCI spec with CDI content.
//
// Device names are collected first from the CRI CDIDevices field and then
// from CDI annotations, skipping names that were already seen, and the
// combined list is handed to oci.WithCDIDevices. An error is returned when
// the annotations cannot be parsed.
func WithCDI(annotations map[string]string, CDIDevices []*runtime.CDIDevice) oci.SpecOpts {
        return func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
                var (
                        names []string
                        known = make(map[string]bool)
                )

                // Add devices from CDIDevices CRI field
                for _, dev := range CDIDevices {
                        name := dev.Name
                        if known[name] {
                                log.G(ctx).Debugf("Skipping duplicated CDI device %s", name)
                                continue
                        }
                        known[name] = true
                        names = append(names, name)
                }
                log.G(ctx).Infof("Container %v: CDI devices from CRI Config.CDIDevices: %v", c.ID, names)

                // Add devices from CDI annotations
                _, fromAnnotations, parseErr := cdi.ParseAnnotations(annotations)
                if parseErr != nil {
                        return fmt.Errorf("failed to parse CDI device annotations: %w", parseErr)
                }

                if fromAnnotations != nil {
                        log.G(ctx).Infof("Container %v: CDI devices from annotations: %v", c.ID, fromAnnotations)
                        for _, name := range fromAnnotations {
                                if known[name] {
                                        // TODO: change to Warning when passing CDI devices as annotations is deprecated
                                        log.G(ctx).Debugf("Skipping duplicated CDI device %s", name)
                                        continue
                                }
                                known[name] = true
                                names = append(names, name)
                        }
                        // TODO: change to Warning when passing CDI devices as annotations is deprecated
                        log.G(ctx).Debug("Passing CDI devices as annotations will be deprecated soon, please use CRI CDIDevices instead")
                }

                apply := oci.WithCDIDevices(names...)
                return apply(ctx, client, c, s)
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package opts

import (
        "context"
        "errors"
        "fmt"
        "os"
        "path/filepath"
        "sort"
        "strconv"
        "strings"
        "syscall"

        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        "github.com/opencontainers/selinux/go-selinux/label"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
        crierrors "k8s.io/cri-api/pkg/errors"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/pkg/oci"
        osinterface "github.com/containerd/containerd/v2/pkg/os"
        "github.com/containerd/log"
)

// WithMounts sorts and adds runtime and CRI mounts to the spec.
//
// The CRI mounts from config are merged with extra; when both target the same
// cleaned container path, the CRI mount is kept. Mounts already present on the
// spec (plus a read-only cgroup mount added here) are treated as defaults and
// are dropped when a supplied mount has the same destination, or when they
// live under /dev and /dev itself is a supplied mount. Every supplied mount is
// then appended to the spec as a bind mount.
//
// mountLabel is the label applied to the source of mounts that request SELinux
// relabeling, and handler is consulted to decide whether recursive read-only
// mounts may be used. Host paths that do not exist are created as directories
// with mode 0755 before their symlinks are resolved.
func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string, handler *runtime.RuntimeHandler) oci.SpecOpts {
        return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) {
                // Merge CRI mounts with extra mounts. If a mount destination
                // is mounted by both a CRI mount and an extra mount, the CRI mount will
                // be kept.
                var (
                        criMounts = config.GetMounts()
                        mounts    = append([]*runtime.Mount{}, criMounts...)
                )
                // Copy all mounts from extra mounts, except for mounts overridden by CRI.
                for _, e := range extra {
                        found := false
                        for _, c := range criMounts {
                                if filepath.Clean(e.ContainerPath) == filepath.Clean(c.ContainerPath) {
                                        found = true
                                        break
                                }
                        }
                        if !found {
                                mounts = append(mounts, e)
                        }
                }

                // Sort mounts in number of parts. This ensures that high level mounts don't
                // shadow other mounts.
                sort.Stable(orderedMounts(mounts))

                // Mount cgroup into the container as readonly, which inherits docker's behavior.
                // It is appended before the default-mount filtering below, so a supplied
                // mount on /sys/fs/cgroup still overrides it.
                s.Mounts = append(s.Mounts, runtimespec.Mount{
                        Source:      "cgroup",
                        Destination: "/sys/fs/cgroup",
                        Type:        "cgroup",
                        Options:     []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
                })

                // Copy all mounts from default mounts, except for
                // - mounts overridden by supplied mount;
                // - all mounts under /dev if a supplied /dev is present.
                mountSet := make(map[string]struct{})
                for _, m := range mounts {
                        mountSet[filepath.Clean(m.ContainerPath)] = struct{}{}
                }

                defaultMounts := s.Mounts
                s.Mounts = nil

                for _, m := range defaultMounts {
                        dst := filepath.Clean(m.Destination)
                        if _, ok := mountSet[dst]; ok {
                                // filter out mount overridden by a supplied mount
                                continue
                        }
                        if _, mountDev := mountSet["/dev"]; mountDev && strings.HasPrefix(dst, "/dev/") {
                                // filter out everything under /dev if /dev is a supplied mount
                                continue
                        }
                        s.Mounts = append(s.Mounts, m)
                }

                for _, mount := range mounts {
                        var (
                                dst      = mount.GetContainerPath()
                                hostPath = mount.GetHostPath()
                        )
                        // Create the host path if it doesn't exist.
                        // TODO(random-liu): Add CRI validation test for this case.
                        if _, err := osi.Stat(hostPath); err != nil {
                                if !os.IsNotExist(err) {
                                        return fmt.Errorf("failed to stat %q: %w", hostPath, err)
                                }
                                if err := osi.MkdirAll(hostPath, 0755); err != nil {
                                        return fmt.Errorf("failed to mkdir %q: %w", hostPath, err)
                                }
                        }
                        // TODO(random-liu): Add cri-containerd integration test or cri validation test
                        // for this.
                        //
                        // The resolved path is kept in its own variable so that a failure
                        // reports the host path that was requested rather than whatever
                        // the resolver returned alongside the error.
                        src, err := osi.ResolveSymbolicLink(hostPath)
                        if err != nil {
                                return fmt.Errorf("failed to resolve symlink %q: %w", hostPath, err)
                        }
                        if s.Linux == nil {
                                s.Linux = &runtimespec.Linux{}
                        }
                        options := []string{"rbind"}
                        switch mount.GetPropagation() {
                        case runtime.MountPropagation_PROPAGATION_PRIVATE:
                                options = append(options, "rprivate")
                                // Since default root propagation in runc is rprivate ignore
                                // setting the root propagation
                        case runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL:
                                if err := ensureShared(src, osi.LookupMount); err != nil {
                                        return err
                                }
                                options = append(options, "rshared")
                                s.Linux.RootfsPropagation = "rshared"
                        case runtime.MountPropagation_PROPAGATION_HOST_TO_CONTAINER:
                                if err := ensureSharedOrSlave(src, osi.LookupMount); err != nil {
                                        return err
                                }
                                options = append(options, "rslave")
                                // Never downgrade a root propagation that an earlier mount
                                // already raised to rshared.
                                if s.Linux.RootfsPropagation != "rshared" &&
                                        s.Linux.RootfsPropagation != "rslave" {
                                        s.Linux.RootfsPropagation = "rslave"
                                }
                        default:
                                log.G(ctx).Warnf("Unknown propagation mode for hostPath %q", mount.HostPath)
                                options = append(options, "rprivate")
                        }

                        // NOTE(random-liu): we don't change all mounts to `ro` when root filesystem
                        // is readonly. This is different from docker's behavior, but make more sense.
                        if mount.GetReadonly() {
                                if mount.GetRecursiveReadOnly() {
                                        if handler == nil || !handler.Features.RecursiveReadOnlyMounts {
                                                return fmt.Errorf("%w: runtime handler does not support recursive read-only mounts (hostPath=%q)",
                                                        crierrors.ErrRROUnsupported, mount.HostPath)
                                        }
                                        if mount.Propagation != runtime.MountPropagation_PROPAGATION_PRIVATE {
                                                return fmt.Errorf("recursive read-only mount needs private propagation, got %q (hostPath=%q)",
                                                        mount.Propagation.String(), mount.HostPath)
                                        }
                                        options = append(options, "rro")
                                } else {
                                        options = append(options, "ro")
                                }
                        } else {
                                if mount.GetRecursiveReadOnly() {
                                        return fmt.Errorf("recursive read-only mount conflicts with RW mount (hostPath=%q)",
                                                mount.HostPath)
                                }
                                options = append(options, "rw")
                        }

                        if mount.GetSelinuxRelabel() {
                                enotsup := syscall.Errno(0x5f) // Linux specific error code, this branch will not execute on non Linux platforms.
                                // errors.Is also matches ENOTSUP when it arrives wrapped in
                                // another error; an unsupported relabel is not fatal.
                                if err := label.Relabel(src, mountLabel, false); err != nil && !errors.Is(err, enotsup) {
                                        return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err)
                                }
                        }

                        // Ranging over a nil slice is a no-op, so absent mappings leave
                        // the corresponding spec field nil.
                        var uidMapping []runtimespec.LinuxIDMapping
                        for _, mapping := range mount.UidMappings {
                                uidMapping = append(uidMapping, runtimespec.LinuxIDMapping{
                                        HostID:      mapping.HostId,
                                        ContainerID: mapping.ContainerId,
                                        Size:        mapping.Length,
                                })
                        }
                        var gidMapping []runtimespec.LinuxIDMapping
                        for _, mapping := range mount.GidMappings {
                                gidMapping = append(gidMapping, runtimespec.LinuxIDMapping{
                                        HostID:      mapping.HostId,
                                        ContainerID: mapping.ContainerId,
                                        Size:        mapping.Length,
                                })
                        }

                        s.Mounts = append(s.Mounts, runtimespec.Mount{
                                Source:      src,
                                Destination: dst,
                                Type:        "bind",
                                Options:     options,
                                UIDMappings: uidMapping,
                                GIDMappings: gidMapping,
                        })
                }
                return nil
        }
}

// ensureShared verifies that the mount point on which path is mounted is a
// shared mount, i.e. that one of the space-separated entries of its Optional
// field starts with "shared:". lookupMount resolves path to the mount.Info of
// that mount point; a lookup error is returned unchanged.
func ensureShared(path string, lookupMount func(string) (mount.Info, error)) error {
        info, err := lookupMount(path)
        if err != nil {
                return err
        }

        shared := false
        for _, field := range strings.Split(info.Optional, " ") {
                if strings.HasPrefix(field, "shared:") {
                        shared = true
                        break
                }
        }
        if !shared {
                return fmt.Errorf("path %q is mounted on %q but it is not a shared mount", path, info.Mountpoint)
        }
        return nil
}

// ensureSharedOrSlave verifies that the mount point on which path is mounted
// is either a shared or a slave mount, i.e. that one of the space-separated
// entries of its Optional field starts with "shared:" or "master:".
// lookupMount resolves path to the mount.Info of that mount point; a lookup
// error is returned unchanged.
func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, error)) error {
        info, err := lookupMount(path)
        if err != nil {
                return err
        }
        for _, field := range strings.Split(info.Optional, " ") {
                switch {
                case strings.HasPrefix(field, "shared:"), strings.HasPrefix(field, "master:"):
                        // Either propagation marker is sufficient.
                        return nil
                }
        }
        return fmt.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, info.Mountpoint)
}

// getDeviceUserGroupID() is used to find the right uid/gid
// value for the device node created in the container namespace.
// The runtime executes mknod() and chmod()s the created
// device with the values returned here.
//
// On Linux, uid and gid are sufficient and the user/groupname do not
// need to be resolved.
//
// TODO(mythi): In case of user namespaces, the runtime simply bind
// mounts the devices from the host. Additional logic is needed
// to check that the runtimes effective UID/GID on the host has the
// permissions to access the device node and/or the right user namespace
// mappings are created.
//
// Ref: https://github.com/kubernetes/kubernetes/issues/92211
func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 {
        if runAsVal == nil {
                // No value was supplied: report ID 0.
                return 0
        }
        // The int64 carried by the CRI message is narrowed to uint32.
        return uint32(runAsVal.GetValue())
}

// WithDevices sets the provided devices onto the container spec.
//
// Each device in config has its host path resolved through
// osi.ResolveSymbolicLink and is then added with oci.WithDevices. When
// enableDeviceOwnershipFromSecurityContext is true, a non-zero RunAsUser /
// RunAsGroup from the Linux security context overrides the UID / GID of the
// devices added by this call; devices that were already on the spec are left
// untouched. Any error from symlink resolution or from oci.WithDevices is
// returned as-is.
func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts {
        return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
                if s.Linux == nil {
                        s.Linux = &runtimespec.Linux{}
                }
                if s.Linux.Resources == nil {
                        s.Linux.Resources = &runtimespec.LinuxResources{}
                }

                // Remember how many devices existed so that only the ones appended
                // below are candidates for the ownership override.
                oldDevices := len(s.Linux.Devices)

                for _, device := range config.GetDevices() {
                        path, err := osi.ResolveSymbolicLink(device.HostPath)
                        if err != nil {
                                return err
                        }

                        o := oci.WithDevices(path, device.ContainerPath, device.Permissions)
                        if err := o(ctx, client, c, s); err != nil {
                                return err
                        }
                }

                if !enableDeviceOwnershipFromSecurityContext {
                        return nil
                }

                securityContext := config.GetLinux().GetSecurityContext()
                uid := getDeviceUserGroupID(securityContext.GetRunAsUser())
                gid := getDeviceUserGroupID(securityContext.GetRunAsGroup())
                // Loop all new devices added by oci.WithDevices() to update their
                // dev.UID/dev.GID.
                //
                // non-zero UID/GID from SecurityContext is used to override host's
                // device UID/GID for the container.
                for idx := oldDevices; idx < len(s.Linux.Devices); idx++ {
                        dev := &s.Linux.Devices[idx]
                        // UID and GID are optional pointers on the device; assign a
                        // fresh pointer instead of writing through the existing one so
                        // that a device without an owner cannot cause a nil dereference.
                        if uid != 0 {
                                devUID := uid
                                dev.UID = &devUID
                        }
                        if gid != 0 {
                                devGID := gid
                                dev.GID = &devGID
                        }
                }
                return nil
        }
}

// WithResources sets the provided resource restrictions on the spec.
//
// A nil resources leaves the spec untouched. Otherwise non-zero CPU period,
// quota and shares, non-empty cpuset strings, memory and swap limits, huge
// page limits and unified cgroup settings are copied onto s.Linux.Resources.
// Huge page limits are skipped entirely when disableHugetlbController is set;
// when the hugetlb controller is not present, an error is returned unless
// tolerateMissingHugetlbController is true, in which case a warning is logged.
func WithResources(resources *runtime.LinuxContainerResources, tolerateMissingHugetlbController, disableHugetlbController bool) oci.SpecOpts {
        return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
                if resources == nil {
                        return nil
                }

                // Make sure every nested struct written below exists.
                if s.Linux == nil {
                        s.Linux = &runtimespec.Linux{}
                }
                if s.Linux.Resources == nil {
                        s.Linux.Resources = &runtimespec.LinuxResources{}
                }
                if s.Linux.Resources.CPU == nil {
                        s.Linux.Resources.CPU = &runtimespec.LinuxCPU{}
                }
                if s.Linux.Resources.Memory == nil {
                        s.Linux.Resources.Memory = &runtimespec.LinuxMemory{}
                }
                cpu, memory := s.Linux.Resources.CPU, s.Linux.Resources.Memory

                // CPU: zero values and empty strings mean "not specified".
                if period := uint64(resources.GetCpuPeriod()); period != 0 {
                        cpu.Period = &period
                }
                if quota := resources.GetCpuQuota(); quota != 0 {
                        cpu.Quota = &quota
                }
                if cpuShares := uint64(resources.GetCpuShares()); cpuShares != 0 {
                        cpu.Shares = &cpuShares
                }
                if cpusetCpus := resources.GetCpusetCpus(); cpusetCpus != "" {
                        cpu.Cpus = cpusetCpus
                }
                if cpusetMems := resources.GetCpusetMems(); cpusetMems != "" {
                        cpu.Mems = cpusetMems
                }

                // Memory and swap.
                memLimit := resources.GetMemoryLimitInBytes()
                swap := resources.GetMemorySwapLimitInBytes()
                if memLimit != 0 {
                        memory.Limit = &memLimit
                        // swap/memory limit should be equal to prevent container from swapping by default
                        if swap == 0 && SwapControllerAvailable() {
                                memory.Swap = &memLimit
                        }
                }
                if swap != 0 && SwapControllerAvailable() {
                        memory.Swap = &swap
                }

                // Huge pages.
                switch {
                case disableHugetlbController:
                        // Huge page limits are deliberately not applied.
                case isHugetlbControllerPresent():
                        for _, hp := range resources.GetHugepageLimits() {
                                s.Linux.Resources.HugepageLimits = append(s.Linux.Resources.HugepageLimits, runtimespec.LinuxHugepageLimit{
                                        Pagesize: hp.PageSize,
                                        Limit:    hp.Limit,
                                })
                        }
                case !tolerateMissingHugetlbController:
                        return errors.New("huge pages limits are specified but hugetlb cgroup controller is missing. " +
                                "Please set tolerate_missing_hugetlb_controller to `true` to ignore this error")
                default:
                        log.L.Warn("hugetlb cgroup controller is absent. skipping huge pages limits")
                }

                // Unified (cgroup v2 style) key/value settings; existing keys are overwritten.
                if extra := resources.GetUnified(); extra != nil {
                        if s.Linux.Resources.Unified == nil {
                                s.Linux.Resources.Unified = make(map[string]string)
                        }
                        for key, value := range extra {
                                s.Linux.Resources.Unified[key] = value
                        }
                }
                return nil
        }
}

// WithOOMScoreAdj sets the oom score adjustment of the container process from
// the Linux resources in config. Nothing is set when config carries no Linux
// resources. When restrict is true the value is first passed through
// restrictOOMScoreAdj, whose error, if any, is returned.
func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpts {
        return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
                if s.Process == nil {
                        s.Process = &runtimespec.Process{}
                }

                linuxResources := config.GetLinux().GetResources()
                if linuxResources == nil {
                        return nil
                }

                score := int(linuxResources.GetOomScoreAdj())
                if restrict {
                        restricted, err := restrictOOMScoreAdj(score)
                        if err != nil {
                                return err
                        }
                        score = restricted
                }
                s.Process.OOMScoreAdj = &score
                return nil
        }
}

// WithPodOOMScoreAdj sets the oom score for the pod sandbox.
//
// adj is the requested score. When restrict is true it is first passed
// through restrictOOMScoreAdj, whose error, if any, is returned. Each
// application of the returned option works on its own copy of adj, so specs
// built from the same option never share the integer that OOMScoreAdj points
// to.
func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts {
        return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
                if s.Process == nil {
                        s.Process = &runtimespec.Process{}
                }
                // Copy the captured parameter: taking the address of adj itself
                // would make every spec this option is applied to alias one value.
                score := adj
                if restrict {
                        restricted, err := restrictOOMScoreAdj(score)
                        if err != nil {
                                return err
                        }
                        score = restricted
                }
                s.Process.OOMScoreAdj = &score
                return nil
        }
}

// getCurrentOOMScoreAdj reads /proc/self/oom_score_adj and returns its content
// parsed as an integer. Both read and parse failures are wrapped with the same
// message and reported with a zero value.
func getCurrentOOMScoreAdj() (int, error) {
        const wrapFormat = "could not get the daemon oom_score_adj: %w"

        raw, readErr := os.ReadFile("/proc/self/oom_score_adj")
        if readErr != nil {
                return 0, fmt.Errorf(wrapFormat, readErr)
        }
        // The file content may carry surrounding whitespace such as a trailing newline.
        score, parseErr := strconv.Atoi(strings.TrimSpace(string(raw)))
        if parseErr != nil {
                return 0, fmt.Errorf(wrapFormat, parseErr)
        }
        return score, nil
}

// restrictOOMScoreAdj returns preferredOOMScoreAdj unless it is lower than the
// value reported by getCurrentOOMScoreAdj, in which case that current value is
// returned instead. If the current value cannot be determined, the preferred
// value is returned together with the error.
func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) {
        daemonScore, err := getCurrentOOMScoreAdj()
        switch {
        case err != nil:
                return preferredOOMScoreAdj, err
        case daemonScore > preferredOOMScoreAdj:
                return daemonScore, nil
        default:
                return preferredOOMScoreAdj, nil
        }
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package opts

import (
        "context"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/errdefs"
        imagespec "github.com/opencontainers/image-spec/specs-go/v1"
        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// WithProcessCommandLineOrArgsForWindows is the variant built when the target
// is not Windows: config and image are ignored and the returned option always
// fails with errdefs.ErrNotImplemented.
func WithProcessCommandLineOrArgsForWindows(config *runtime.ContainerConfig, image *imagespec.ImageConfig) oci.SpecOpts {
        return func(context.Context, oci.Client, *containers.Container, *runtimespec.Spec) error {
                return errdefs.ErrNotImplemented
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package opts

import (
        "context"
        "errors"
        "fmt"
        "os"
        "path/filepath"
        "sort"
        "strings"

        imagespec "github.com/opencontainers/image-spec/specs-go/v1"
        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/containerd/v2/pkg/oci"
)

// DefaultSandboxCPUshares is the default CPU shares value for the sandbox container.
// TODO(windows): Revisit cpu shares for windows (https://github.com/containerd/cri/issues/1297)
const DefaultSandboxCPUshares = 2

// WithRelativeRoot sets the root path for the container to root, creating the
// spec's Root section when it does not exist yet. Other fields of an existing
// Root are left as they are.
func WithRelativeRoot(root string) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error {
                if s.Root != nil {
                        s.Root.Path = root
                        return nil
                }
                s.Root = &runtimespec.Root{Path: root}
                return nil
        }
}

// WithoutRoot sets the root to nil for the container.
// Only s is touched: s.Root is cleared unconditionally and the function
// always returns nil; ctx, client and c are not used.
func WithoutRoot(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
        s.Root = nil
        return nil
}

// WithProcessArgs sets the process args on the spec based on the image and runtime config.
//
// The command and args from config take precedence. When config has no
// command, missing args fall back to image.Cmd, and a nil command falls back
// to image.Entrypoint unless the entrypoint is the single empty string. An
// error is returned when neither a command nor args result. The final
// argument list is always a freshly allocated slice, so neither config nor
// image share backing storage with the spec.
func WithProcessArgs(config *runtime.ContainerConfig, image *imagespec.ImageConfig) oci.SpecOpts {
        return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
                command, args := config.GetCommand(), config.GetArgs()
                // The following logic is migrated from https://github.com/moby/moby/blob/master/daemon/commit.go
                // TODO(random-liu): Clearly define the commands overwrite behavior.
                if len(command) == 0 {
                        if len(args) == 0 {
                                args = image.Cmd
                        }
                        // Only a nil command falls back to the image entrypoint; an
                        // explicitly empty, non-nil command does not.
                        if command == nil {
                                if !(len(image.Entrypoint) == 1 && image.Entrypoint[0] == "") {
                                        command = image.Entrypoint
                                }
                        }
                }
                if len(command) == 0 && len(args) == 0 {
                        return errors.New("no command specified")
                }
                // Copy into a new slice: appending args directly onto command could
                // write into spare capacity of the caller's backing array.
                processArgs := make([]string, 0, len(command)+len(args))
                processArgs = append(processArgs, command...)
                processArgs = append(processArgs, args...)
                return oci.WithProcessArgs(processArgs...)(ctx, client, c, s)
        }
}

// orderedMounts defines how to sort runtime.Mount: by the number of path
// parts in the cleaned container path, fewest first.
// This is the same with the Docker implementation:
//
//        https://github.com/moby/moby/blob/17.05.x/daemon/volumes.go#L26
type orderedMounts []*runtime.Mount

// Len returns the number of mounts. Used in sorting.
func (m orderedMounts) Len() int {
        return len(m)
}

// Less returns true if the number of parts (a/b/c would be 3 parts) in the
// mount indexed by parameter 1 is less than that of the mount indexed by
// parameter 2. Used in sorting.
func (m orderedMounts) Less(i, j int) bool {
        return m.parts(i) < m.parts(j)
}

// Swap swaps two items in an array of mounts. Used in sorting.
func (m orderedMounts) Swap(i, j int) {
        m[i], m[j] = m[j], m[i]
}

// parts returns the number of parts in the destination of a mount, computed
// as the count of OS path separators in the cleaned ContainerPath. Used in sorting.
func (m orderedMounts) parts(i int) int {
        return strings.Count(filepath.Clean(m[i].ContainerPath), string(os.PathSeparator))
}

// WithAnnotation sets annotation k to v on the spec, creating the annotation
// map when the spec has none. An existing value for k is overwritten.
func WithAnnotation(k, v string) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error {
                if s.Annotations == nil {
                        s.Annotations = map[string]string{k: v}
                        return nil
                }
                s.Annotations[k] = v
                return nil
        }
}

// WithAdditionalGIDs adds any additional groups listed for a particular user in the
// /etc/groups file of the image's root filesystem to the OCI spec's additionalGids array.
// The lookup itself is delegated to oci.WithAdditionalGIDs; the groups that were
// on the spec before that call are merged back in afterwards via mergeGids.
func WithAdditionalGIDs(userstr string) oci.SpecOpts {
        return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
                if s.Process == nil {
                        s.Process = &runtimespec.Process{}
                }
                // Keep the groups already present before the delegated option runs.
                existing := s.Process.User.AdditionalGids
                lookup := oci.WithAdditionalGIDs(userstr)
                if err = lookup(ctx, client, c, s); err != nil {
                        return err
                }
                s.Process.User.AdditionalGids = mergeGids(s.Process.User.AdditionalGids, existing)
                return nil
        }
}

// mergeGids returns the union of gids1 and gids2 with duplicates removed,
// sorted in ascending order. The result is nil when both inputs are empty.
func mergeGids(gids1, gids2 []uint32) []uint32 {
        seen := make(map[uint32]struct{}, len(gids1)+len(gids2))
        var merged []uint32
        for _, source := range [][]uint32{gids1, gids2} {
                for _, gid := range source {
                        if _, dup := seen[gid]; dup {
                                continue
                        }
                        seen[gid] = struct{}{}
                        merged = append(merged, gid)
                }
        }
        sort.Slice(merged, func(a, b int) bool { return merged[a] < merged[b] })
        return merged
}

// WithoutDefaultSecuritySettings removes the default security settings generated on a spec:
// the seccomp profile (when a Linux section exists), the AppArmor profile and
// the process rlimits. A Process section is created when the spec has none.
func WithoutDefaultSecuritySettings(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error {
        // Make sure no default seccomp is specified.
        if s.Linux != nil {
                s.Linux.Seccomp = nil
        }
        if s.Process == nil {
                // A freshly created Process already has an empty AppArmor profile
                // and no rlimits.
                s.Process = &runtimespec.Process{}
                return nil
        }
        // Make sure no default apparmor is specified.
        s.Process.ApparmorProfile = ""
        // Remove default rlimits (See https://github.com/containerd/cri/issues/515)
        s.Process.Rlimits = nil
        return nil
}

// WithCapabilities sets the provided capabilities from the security context.
//
// When sc carries no capabilities the returned option does nothing. Otherwise
// the options are composed in this order: set all of allCaps if "ALL" is
// added, clear all if "ALL" is dropped, then add and drop the individually
// named capabilities (upper-cased and prefixed with "CAP_").
func WithCapabilities(sc *runtime.LinuxContainerSecurityContext, allCaps []string) oci.SpecOpts {
        capabilities := sc.GetCapabilities()
        if capabilities == nil {
                return nullOpt
        }
        toAdd, toDrop := capabilities.GetAddCapabilities(), capabilities.GetDropCapabilities()

        // Add/drop all capabilities if "all" is specified, so that
        // following individual add/drop could still work. E.g.
        // AddCapabilities: []string{"ALL"}, DropCapabilities: []string{"CHOWN"}
        // will be all capabilities without `CAP_CHOWN`.
        var opts []oci.SpecOpts
        if util.InStringSlice(toAdd, "ALL") {
                opts = append(opts, oci.WithCapabilities(allCaps))
        }
        if util.InStringSlice(toDrop, "ALL") {
                opts = append(opts, oci.WithCapabilities(nil))
        }

        // prefixed appends every name except "ALL" to dst in OCI form.
        // Capabilities in CRI doesn't have `CAP_` prefix, so add it.
        prefixed := func(dst, names []string) []string {
                for _, name := range names {
                        upper := strings.ToUpper(name)
                        if upper == "ALL" {
                                continue
                        }
                        dst = append(dst, "CAP_"+upper)
                }
                return dst
        }

        opts = append(opts,
                oci.WithAddedCapabilities(prefixed(nil, toAdd)),
                oci.WithDroppedCapabilities(prefixed([]string{}, toDrop)),
        )
        return oci.Compose(opts...)
}

// nullOpt is a no-op option: it ignores all of its arguments, leaves the spec
// untouched and always returns nil.
func nullOpt(_ context.Context, _ oci.Client, _ *containers.Container, _ *runtimespec.Spec) error {
        return nil
}

// WithoutAmbientCaps removes the ambient caps from the spec. Missing Process
// and Capabilities sections are created empty along the way.
func WithoutAmbientCaps(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error {
        if s.Process == nil {
                s.Process = &runtimespec.Process{}
        }
        if caps := s.Process.Capabilities; caps != nil {
                caps.Ambient = nil
                return nil
        }
        // A new capability set starts out with no ambient capabilities.
        s.Process.Capabilities = &runtimespec.LinuxCapabilities{}
        return nil
}

// WithDisabledCgroups clears the Cgroups Path from the spec, creating an
// empty Linux section when the spec has none.
func WithDisabledCgroups(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error {
        if s.Linux == nil {
                // A new Linux section already has an empty CgroupsPath.
                s.Linux = &runtimespec.Linux{}
                return nil
        }
        s.Linux.CgroupsPath = ""
        return nil
}

// WithSelinuxLabels sets the mount and process labels: mount is stored as the
// Linux mount label and process as the process SELinux label. Missing Linux
// and Process sections are created.
func WithSelinuxLabels(process, mount string) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error {
                if s.Linux == nil {
                        s.Linux = &runtimespec.Linux{}
                }
                s.Linux.MountLabel = mount

                if s.Process == nil {
                        s.Process = &runtimespec.Process{}
                }
                s.Process.SelinuxLabel = process
                return nil
        }
}

// WithSysctls returns a spec option that copies every entry of sysctls into
// the spec's Linux sysctl map. Entries already present under the same key are
// overwritten; other existing entries are kept.
func WithSysctls(sysctls map[string]string) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error {
                linux := s.Linux
                if linux == nil {
                        linux = &runtimespec.Linux{}
                        s.Linux = linux
                }
                if linux.Sysctl == nil {
                        linux.Sysctl = make(map[string]string)
                }
                for name, value := range sysctls {
                        linux.Sysctl[name] = value
                }
                return nil
        }
}

// WithSupplementalGroups returns a spec option that merges the given group IDs
// into the additional GIDs of the process user. Each ID is converted to
// uint32 before being handed to mergeGids.
func WithSupplementalGroups(groups []int64) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error {
                if s.Process == nil {
                        s.Process = &runtimespec.Process{}
                }
                // Left nil on purpose when groups is empty.
                var extra []uint32
                for i := range groups {
                        extra = append(extra, uint32(groups[i]))
                }
                user := &s.Process.User
                user.AdditionalGids = mergeGids(user.AdditionalGids, extra)
                return nil
        }
}

// WithDefaultSandboxShares sets the Linux CPU shares in the spec to
// DefaultSandboxCPUshares, creating the Linux, Resources and CPU sections on
// the way when they are missing.
func WithDefaultSandboxShares(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
        if s.Linux == nil {
                s.Linux = &runtimespec.Linux{}
        }
        linux := s.Linux
        if linux.Resources == nil {
                linux.Resources = &runtimespec.LinuxResources{}
        }
        resources := linux.Resources
        if resources.CPU == nil {
                resources.CPU = &runtimespec.LinuxCPU{}
        }
        shares := uint64(DefaultSandboxCPUshares)
        resources.CPU.Shares = &shares
        return nil
}

// WithoutNamespace returns a spec option that drops every namespace of type t
// from the spec's Linux namespaces. A spec without a Linux section is left
// alone.
func WithoutNamespace(t runtimespec.LinuxNamespaceType) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error {
                if s.Linux == nil {
                        return nil
                }
                // Stays nil when no namespace survives the filter.
                var kept []runtimespec.LinuxNamespace
                for _, ns := range s.Linux.Namespaces {
                        if ns.Type == t {
                                continue
                        }
                        kept = append(kept, ns)
                }
                s.Linux.Namespaces = kept
                return nil
        }
}

// WithPodNamespaces returns a spec option that joins the container to the
// namespaces of its pod.
//
// The network, IPC and UTS namespaces are always taken from sandboxPid. The PID
// namespace of targetPid is joined unless the configured PID mode is
// CONTAINER. When user namespace options are present and their mode is POD,
// the user namespace of sandboxPid is joined and the uids/gids mappings are
// applied.
func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts {
        nsOptions := config.GetNamespaceOptions()

        join := func(nsType runtimespec.LinuxNamespaceType, path string) oci.SpecOpts {
                return oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: nsType, Path: path})
        }

        specOpts := []oci.SpecOpts{
                join(runtimespec.NetworkNamespace, GetNetworkNamespace(sandboxPid)),
                join(runtimespec.IPCNamespace, GetIPCNamespace(sandboxPid)),
                join(runtimespec.UTSNamespace, GetUTSNamespace(sandboxPid)),
        }
        if nsOptions.GetPid() != runtime.NamespaceMode_CONTAINER {
                specOpts = append(specOpts, join(runtimespec.PIDNamespace, GetPIDNamespace(targetPid)))
        }

        // Only the POD mode needs work. For NODE, leaving the userns field out
        // makes the container use the node user namespace. The nil check must
        // stay: without options no user namespace handling is done at all.
        if userns := nsOptions.GetUsernsOptions(); userns != nil && userns.GetMode() == runtime.NamespaceMode_POD {
                specOpts = append(specOpts,
                        join(runtimespec.UserNamespace, GetUserNamespace(sandboxPid)),
                        oci.WithUserNamespace(uids, gids),
                )
        }

        return oci.Compose(specOpts...)
}

// Format strings for the namespace paths of a process under /proc. Each one
// carries a single %v verb that the Get*Namespace helpers fill with a pid.
const (
        // netNSFormat is the path format of the network namespace of a process.
        netNSFormat = "/proc/%v/ns/net"
        // ipcNSFormat is the path format of the ipc namespace of a process.
        ipcNSFormat = "/proc/%v/ns/ipc"
        // utsNSFormat is the path format of the uts namespace of a process.
        utsNSFormat = "/proc/%v/ns/uts"
        // pidNSFormat is the path format of the pid namespace of a process.
        pidNSFormat = "/proc/%v/ns/pid"
        // userNSFormat is the path format of the user namespace of a process.
        userNSFormat = "/proc/%v/ns/user"
)

// GetNetworkNamespace builds the network namespace path of the process with
// the given pid by filling pid into netNSFormat.
func GetNetworkNamespace(pid uint32) string {
        nsPath := fmt.Sprintf(netNSFormat, pid)
        return nsPath
}

// GetIPCNamespace builds the ipc namespace path of the process with the given
// pid by filling pid into ipcNSFormat.
func GetIPCNamespace(pid uint32) string {
        nsPath := fmt.Sprintf(ipcNSFormat, pid)
        return nsPath
}

// GetUTSNamespace builds the uts namespace path of the process with the given
// pid by filling pid into utsNSFormat.
func GetUTSNamespace(pid uint32) string {
        nsPath := fmt.Sprintf(utsNSFormat, pid)
        return nsPath
}

// GetPIDNamespace builds the pid namespace path of the process with the given
// pid by filling pid into pidNSFormat.
func GetPIDNamespace(pid uint32) string {
        nsPath := fmt.Sprintf(pidNSFormat, pid)
        return nsPath
}

// GetUserNamespace builds the user namespace path of the process with the
// given pid by filling pid into userNSFormat.
func GetUserNamespace(pid uint32) string {
        nsPath := fmt.Sprintf(userNSFormat, pid)
        return nsPath
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package opts

import (
        "context"
        "fmt"
        "os"
        "path/filepath"
        "sort"
        "strings"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/pkg/oci"
        osinterface "github.com/containerd/containerd/v2/pkg/os"
        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// namedPipePath reports whether p starts with the named pipe prefix
// `\\.\pipe\`, including the trailing backslash.
func namedPipePath(p string) bool {
        const pipePrefix = `\\.\pipe\`
        isPipe := strings.HasPrefix(p, pipePrefix)
        return isPipe
}

// cleanMount normalizes a mount path with filepath.Clean. Named pipe paths
// are handed back unchanged.
func cleanMount(p string) string {
        if !namedPipePath(p) {
                return filepath.Clean(p)
        }
        return p
}

// parseMount converts a CRI mount into a runtime spec mount.
//
// For host paths that are not named pipes, the host path is created as a
// directory if it does not exist, symlinks are resolved and both paths are
// cleaned. Named pipe mounts pass both paths through untouched. The single
// mount option is "ro" for readonly mounts and "rw" otherwise. An error is
// returned when the host path cannot be stat'ed, created or resolved, or when
// the destination is the bare C drive.
func parseMount(osi osinterface.OS, mount *runtime.Mount) (*runtimespec.Mount, error) {
        containerPath := mount.GetContainerPath()
        hostPath := mount.GetHostPath()

        // A named pipe must not be stat'ed or otherwise opened, since that could
        // interfere with the listening process, and filepath.Clean breaks named
        // pipe paths. So all path handling is limited to regular host paths.
        if !namedPipePath(hostPath) {
                if _, statErr := osi.Stat(hostPath); statErr != nil {
                        if !os.IsNotExist(statErr) {
                                return nil, fmt.Errorf("failed to stat %q: %w", hostPath, statErr)
                        }
                        // A missing host path is created. This matches the Linux
                        // implementation, though not Docker's behavior on Windows.
                        if mkdirErr := osi.MkdirAll(hostPath, 0755); mkdirErr != nil {
                                return nil, fmt.Errorf("failed to mkdir %q: %w", hostPath, mkdirErr)
                        }
                }

                resolved, err := osi.ResolveSymbolicLink(hostPath)
                if err != nil {
                        return nil, fmt.Errorf("failed to resolve symlink %q: %w", hostPath, err)
                }
                // hcsshim requires a clean path, especially '/' -> '\'.
                hostPath = filepath.Clean(resolved)

                // A bare drive destination (like Z: or E:) is not cleaned, because
                // filepath.Clean would append a '.' that causes an incorrect
                // parameter error when starting the container on Windows.
                bareDrive := len(containerPath) == 2 && containerPath[1] == ':'
                switch {
                case !bareDrive:
                        containerPath = filepath.Clean(containerPath)
                        // Absolute destinations get the C: prefix.
                        if containerPath[0] == '\\' {
                                containerPath = "C:" + containerPath
                        }
                case containerPath[0] == 'c' || containerPath[0] == 'C':
                        return nil, fmt.Errorf("destination path can not be C drive")
                }
        }

        // NOTE(random-liu): mounts are not all switched to `ro` when the root
        // filesystem is readonly. This differs from docker but makes more sense.
        access := "rw"
        if mount.GetReadonly() {
                access = "ro"
        }
        return &runtimespec.Mount{
                Source:      hostPath,
                Destination: containerPath,
                Options:     []string{access},
        }, nil
}

// WithWindowsMounts returns a spec option that merges, sorts and adds the CRI
// mounts of config and the extra mounts to the spec of a windows container.
//
// An extra mount is dropped when a CRI mount has the same cleaned container
// path. Mounts already in the spec are kept unless one of the merged mounts
// targets the same cleaned destination. The merged mounts are parsed with
// parseMount and appended after the kept ones; the first parse error aborts
// the option.
func WithWindowsMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error {
                criMounts := config.GetMounts()

                // Start from a copy of the CRI mounts and add only those extra
                // mounts whose destination no CRI mount claims.
                merged := append([]*runtime.Mount{}, criMounts...)
        nextExtra:
                for _, extraMount := range extra {
                        for _, criMount := range criMounts {
                                if cleanMount(extraMount.ContainerPath) == cleanMount(criMount.ContainerPath) {
                                        continue nextExtra
                                }
                        }
                        merged = append(merged, extraMount)
                }

                // Sort mounts in number of parts, so that high level mounts don't
                // shadow other mounts.
                sort.Stable(orderedMounts(merged))

                // Destinations claimed by the merged mounts.
                claimed := make(map[string]struct{})
                for _, m := range merged {
                        claimed[cleanMount(m.ContainerPath)] = struct{}{}
                }

                // Keep only the spec's existing mounts that are not overridden.
                var kept []runtimespec.Mount
                for _, existing := range s.Mounts {
                        if _, overridden := claimed[cleanMount(existing.Destination)]; !overridden {
                                kept = append(kept, existing)
                        }
                }
                s.Mounts = kept

                for _, m := range merged {
                        parsed, err := parseMount(osi, m)
                        if err != nil {
                                return err
                        }
                        s.Mounts = append(s.Mounts, *parsed)
                }
                return nil
        }
}

// WithWindowsResources returns a spec option that applies the given resource
// restrictions to the Windows section of the spec.
//
// Nil resources leave the spec untouched. Otherwise the Windows, Resources and
// Memory sections are created when missing. Only non-zero values are written:
// CPU count, shares and maximum, and the memory limit. The CPU section is
// created only when at least one CPU value is non-zero.
func WithWindowsResources(resources *runtime.WindowsContainerResources) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error {
                if resources == nil {
                        return nil
                }

                if s.Windows == nil {
                        s.Windows = &runtimespec.Windows{}
                }
                if s.Windows.Resources == nil {
                        s.Windows.Resources = &runtimespec.WindowsResources{}
                }
                winRes := s.Windows.Resources
                if winRes.Memory == nil {
                        winRes.Memory = &runtimespec.WindowsMemoryResources{}
                }

                cpuCount := uint64(resources.GetCpuCount())
                cpuShares := uint16(resources.GetCpuShares())
                cpuMaximum := uint16(resources.GetCpuMaximum())
                memLimit := uint64(resources.GetMemoryLimitInBytes())

                if cpuCount != 0 || cpuShares != 0 || cpuMaximum != 0 {
                        if winRes.CPU == nil {
                                winRes.CPU = &runtimespec.WindowsCPUResources{}
                        }
                        cpu := winRes.CPU
                        if cpuCount != 0 {
                                cpu.Count = &cpuCount
                        }
                        if cpuShares != 0 {
                                cpu.Shares = &cpuShares
                        }
                        if cpuMaximum != 0 {
                                cpu.Maximum = &cpuMaximum
                        }
                }
                if memLimit != 0 {
                        winRes.Memory.Limit = &memLimit
                }
                return nil
        }
}

// WithWindowsDefaultSandboxShares sets the Windows CPU shares in the spec to
// DefaultSandboxCPUshares, creating the Windows, Resources and CPU sections on
// the way when they are missing.
func WithWindowsDefaultSandboxShares(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
        if s.Windows == nil {
                s.Windows = &runtimespec.Windows{}
        }
        windows := s.Windows
        if windows.Resources == nil {
                windows.Resources = &runtimespec.WindowsResources{}
        }
        resources := windows.Resources
        if resources.CPU == nil {
                resources.CPU = &runtimespec.WindowsCPUResources{}
        }
        shares := uint16(DefaultSandboxCPUshares)
        resources.CPU.Shares = &shares
        return nil
}

// WithWindowsCredentialSpec returns a spec option that stores credentialSpec
// in the `Windows.CredentialSpec` field of the spec, creating the Windows
// section when it is missing.
func WithWindowsCredentialSpec(credentialSpec string) oci.SpecOpts {
        return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error {
                windows := s.Windows
                if windows == nil {
                        windows = &runtimespec.Windows{}
                        s.Windows = windows
                }
                windows.CredentialSpec = credentialSpec
                return nil
        }
}

// WithWindowsDevices returns a spec option that adds the devices listed in
// config to the container spec.
//
// Each device must have an empty ContainerPath and empty Permissions. Its
// HostPath must have the form IDType://ID; the shorthand prefix "class/" is
// accepted and treated as "class://". The first invalid device, or the first
// failure to add one, aborts the option with an error.
func WithWindowsDevices(config *runtime.ContainerConfig) oci.SpecOpts {
        return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
                for _, dev := range config.GetDevices() {
                        switch {
                        case dev.ContainerPath != "":
                                return fmt.Errorf("unexpected ContainerPath %s, must be empty", dev.ContainerPath)
                        case dev.Permissions != "":
                                return fmt.Errorf("unexpected Permissions %s, must be empty", dev.Permissions)
                        }

                        // Expand the "class/" shorthand into the IDType://ID form.
                        normalized := dev.HostPath
                        if strings.HasPrefix(normalized, "class/") {
                                normalized = "class://" + normalized[len("class/"):]
                        }

                        idType, id, found := strings.Cut(normalized, "://")
                        if !found {
                                return fmt.Errorf("unrecognised HostPath format %v, must match IDType://ID", dev.HostPath)
                        }

                        if addErr := oci.WithWindowsDevice(idType, id)(ctx, client, c, s); addErr != nil {
                                return fmt.Errorf("failed adding device with HostPath %v: %w", dev.HostPath, addErr)
                        }
                }
                return nil
        }
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "fmt"

        "github.com/containerd/containerd/v2/pkg/blockio"
        "github.com/containerd/log"
)

// blockIOClassFromAnnotations examines container and pod annotations of a
// container and returns its effective blockio class.
//
// An error from the annotation lookup is returned as is. When a class is
// requested but blockio is not enabled, the result depends on the
// IgnoreBlockIONotEnabledErrors config option: if it is set, the class is
// dropped and an empty class is returned; otherwise an error is returned.
func (c *criService) blockIOClassFromAnnotations(containerName string, containerAnnotations, podAnnotations map[string]string) (string, error) {
        cls, err := blockio.ContainerClassFromAnnotations(containerName, containerAnnotations, podAnnotations)
        if err != nil {
                return "", err
        }

        // Nothing to validate when no class was requested or blockio is usable.
        if cls == "" || blockio.IsEnabled() {
                return cls, nil
        }

        if !c.config.ContainerdConfig.IgnoreBlockIONotEnabledErrors {
                return "", fmt.Errorf("blockio disabled, refusing to set blockio class of container %q to %q", containerName, cls)
        }

        // err is always nil at this point, so the dropped class is logged
        // instead of a meaningless "<nil>".
        log.L.Debugf("continuing create container %s, ignoring blockio not enabled (requested class %q)", containerName, cls)
        return "", nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "fmt"
        "os"
        "path/filepath"
        "sync"

        "github.com/containerd/go-cni"
        "github.com/containerd/log"
        "github.com/fsnotify/fsnotify"
)

// cniNetConfSyncer is used to reload cni network conf triggered by fs change
// events.
type cniNetConfSyncer struct {
        // only used for lastSyncStatus
        sync.RWMutex
        // lastSyncStatus is the result of the most recent cni load; nil means
        // the load succeeded. Access goes through lastStatus/updateLastStatus.
        lastSyncStatus error

        // watcher delivers the fs events and errors consumed by syncLoop.
        watcher   *fsnotify.Watcher
        // confDir is the cni conf directory being watched.
        confDir   string
        // netPlugin is the cni plugin whose Load is called on (re)load.
        netPlugin cni.CNI
        // loadOpts are the options passed to every netPlugin.Load call.
        loadOpts  []cni.Opt
}

// newCNINetConfSyncer creates cni network conf syncer.
//
// It creates an fsnotify watcher, makes sure confDir and its parent exist,
// starts watching confDir and performs an initial cni load with loadOpts. A
// failing initial load is not fatal: it is logged and recorded as the last
// sync status. Every other failure is returned as an error, and the watcher is
// closed before returning so that it is not leaked.
func newCNINetConfSyncer(confDir string, netPlugin cni.CNI, loadOpts []cni.Opt) (_ *cniNetConfSyncer, retErr error) {
        watcher, err := fsnotify.NewWatcher()
        if err != nil {
                return nil, fmt.Errorf("failed to create fsnotify watcher: %w", err)
        }
        // Release the watcher on every error return below; on success its
        // ownership moves to the returned syncer.
        defer func() {
                if retErr != nil {
                        // Best effort: the setup error is the one worth reporting.
                        _ = watcher.Close()
                }
        }()

        // /etc/cni has to be readable for non-root users (0755), because /etc/cni/tuning/allowlist.conf is used for rootless mode too.
        // This file was introduced in CNI plugins 1.2.0 (https://github.com/containernetworking/plugins/pull/693), and its path is hard-coded.
        confDirParent := filepath.Dir(confDir)
        if err := os.MkdirAll(confDirParent, 0755); err != nil {
                return nil, fmt.Errorf("failed to create the parent of the cni conf dir=%s: %w", confDirParent, err)
        }

        if err := os.MkdirAll(confDir, 0700); err != nil {
                return nil, fmt.Errorf("failed to create cni conf dir=%s for watch: %w", confDir, err)
        }

        if err := watcher.Add(confDir); err != nil {
                return nil, fmt.Errorf("failed to watch cni conf dir %s: %w", confDir, err)
        }

        syncer := &cniNetConfSyncer{
                watcher:   watcher,
                confDir:   confDir,
                netPlugin: netPlugin,
                loadOpts:  loadOpts,
        }

        if err := syncer.netPlugin.Load(syncer.loadOpts...); err != nil {
                log.L.WithError(err).Error("failed to load cni during init, please check CRI plugin status before setting up network for pods")
                syncer.updateLastStatus(err)
        }
        return syncer, nil
}

// syncLoop consumes the watcher's events and errors until one of them ends the
// loop. It returns nil once the event channel is closed and returns the first
// non-nil watcher error. Every event other than chmod and create triggers a
// cni reload, whose result is stored as the last sync status.
func (syncer *cniNetConfSyncer) syncLoop() error {
        for {
                select {
                case watchErr := <-syncer.watcher.Errors:
                        if watchErr == nil {
                                continue
                        }
                        log.L.WithError(watchErr).Error("failed to continue sync cni conf change")
                        return watchErr

                case ev, open := <-syncer.watcher.Events:
                        if !open {
                                log.L.Debugf("cni watcher channel is closed")
                                return nil
                        }

                        // Reload only on write/rename/remove events.
                        //
                        // TODO(fuweid): Might only reload target cni config
                        // files to prevent no-ops.
                        ignored := ev.Has(fsnotify.Chmod) || ev.Has(fsnotify.Create)
                        if ignored {
                                log.L.Debugf("ignore event from cni conf dir: %s", ev)
                                continue
                        }
                        log.L.Debugf("receiving change event from cni conf dir: %s", ev)

                        loadErr := syncer.netPlugin.Load(syncer.loadOpts...)
                        if loadErr != nil {
                                log.L.WithError(loadErr).
                                        Errorf("failed to reload cni configuration after receiving fs change event(%s)", ev)
                        }
                        syncer.updateLastStatus(loadErr)
                }
        }
}

// lastStatus returns the result of the most recent cni load, read under the
// read lock.
func (syncer *cniNetConfSyncer) lastStatus() error {
        syncer.RLock()
        status := syncer.lastSyncStatus
        syncer.RUnlock()
        return status
}

// updateLastStatus records err as the result of the most recent cni load,
// written under the write lock. It is called after every cni load.
func (syncer *cniNetConfSyncer) updateLastStatus(err error) {
        syncer.Lock()
        syncer.lastSyncStatus = err
        syncer.Unlock()
}

// stop closes the fs watcher used by syncLoop and returns the error reported
// by the close, if any.
func (syncer *cniNetConfSyncer) stop() error {
        closeErr := syncer.watcher.Close()
        return closeErr
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "io"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/log"
        "k8s.io/client-go/tools/remotecommand"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        cio "github.com/containerd/containerd/v2/internal/cri/io"
)

// Attach looks up the requested container, checks that it is running and asks
// the stream server for an attach endpoint. It fails when the container is not
// in the store or is in any state other than running.
func (c *criService) Attach(ctx context.Context, r *runtime.AttachRequest) (*runtime.AttachResponse, error) {
        entry, err := c.containerStore.Get(r.GetContainerId())
        if err != nil {
                return nil, fmt.Errorf("failed to find container in store: %w", err)
        }
        if current := entry.Status.Get().State(); current != runtime.ContainerState_CONTAINER_RUNNING {
                return nil, fmt.Errorf("container is in %s state", criContainerStateToString(current))
        }
        return c.streamServer.GetAttach(r)
}

// attachContainer connects the given streams to the IO of a running container.
//
// It looks the container up in the store, requires it to be running, loads its
// task, forwards terminal resize requests to the task and then attaches the
// streams through the container IO. The derived context is cancelled when the
// function returns.
func (c *criService) attachContainer(ctx context.Context, id string, stdin io.Reader, stdout, stderr io.WriteCloser,
        tty bool, resize <-chan remotecommand.TerminalSize) error {
        ctx, cancel := context.WithCancel(ctx)
        defer cancel()

        // Look the container up in our container store.
        entry, err := c.containerStore.Get(id)
        if err != nil {
                return fmt.Errorf("failed to find container %q in store: %w", id, err)
        }
        // From here on use the ID recorded in the store entry.
        id = entry.ID

        if current := entry.Status.Get().State(); current != runtime.ContainerState_CONTAINER_RUNNING {
                return fmt.Errorf("container is in %s state", criContainerStateToString(current))
        }

        task, err := entry.Container.Task(ctx, nil)
        if err != nil {
                return fmt.Errorf("failed to load task: %w", err)
        }

        onResize := func(size remotecommand.TerminalSize) {
                resizeErr := task.Resize(ctx, uint32(size.Width), uint32(size.Height))
                if resizeErr != nil {
                        log.G(ctx).WithError(resizeErr).Errorf("Failed to resize task %q console", id)
                }
        }
        handleResizing(ctx, resize, onResize)

        closeStdin := func() error {
                return task.CloseIO(ctx, containerd.WithStdinCloser)
        }
        // TODO(random-liu): Figure out whether we need to support historical output.
        entry.IO.Attach(cio.AttachOptions{
                Stdin:      stdin,
                Stdout:     stdout,
                Stderr:     stderr,
                Tty:        tty,
                StdinOnce:  entry.Config.StdinOnce,
                CloseStdin: closeStdin,
        })
        return nil
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "archive/tar"
        "context"
        "encoding/json"
        "errors"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "strings"
        "time"

        crmetadata "github.com/checkpoint-restore/checkpointctl/lib"
        "github.com/checkpoint-restore/go-criu/v7"
        "github.com/containerd/containerd/api/types/runc/options"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/archive"
        "github.com/containerd/containerd/v2/pkg/protobuf/proto"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/log"
        v1 "github.com/opencontainers/image-spec/specs-go/v1"

        "github.com/containerd/containerd/v2/client"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// podCriuVersion is the minimum version of CRIU needed for
// checkpointing and restoring containers out of and into Pods.
const podCriuVersion = 31600

// checkForCriu uses CRIU's go bindings to query the CRIU version and checks
// that it is at least version. It returns an error when the version cannot be
// determined or is lower than required.
func checkForCriu(version int) error {
        installed, err := criu.MakeCriu().GetCriuVersion()
        if err != nil {
                return fmt.Errorf("failed to check for criu version: %w", err)
        }
        if installed < version {
                return fmt.Errorf("checkpoint/restore requires at least CRIU %d, current version is %d", version, installed)
        }
        return nil
}

// CheckpointContainer checkpoints the running container identified in the
// request and writes the result as a tar archive to r.Location.
//
// The function:
//   - verifies that a sufficiently recent CRIU is available,
//   - looks up the container and requires it to be in the running state,
//   - serializes container metadata (config dump) as JSON,
//   - asks the container's task to create a checkpoint image,
//   - extracts the CRIU data, the rw layer and the spec from the checkpoint
//     index into a temporary directory under the container root dir, and
//   - streams that directory as a tar archive into r.Location.
//
// The temporary directory is removed before the function returns. Any failure
// is returned as an error and no response is produced.
func (c *criService) CheckpointContainer(ctx context.Context, r *runtime.CheckpointContainerRequest) (*runtime.CheckpointContainerResponse, error) {
        start := time.Now()
        if err := checkForCriu(podCriuVersion); err != nil {
                // This is the wrong error message and needs to be adapted once
                // Kubernetes (the e2e_node/checkpoint) test has been changed to
                // handle too old or missing CRIU error messages.
                errorMessage := fmt.Sprintf(
                        "CRIU binary not found or too old (<%d). Failed to checkpoint container %q",
                        podCriuVersion,
                        r.GetContainerId(),
                )
                // errorMessage is already formatted; it must not be used as a
                // format string, otherwise a '%' in the container ID would be
                // interpreted as a formatting verb.
                log.G(ctx).WithError(err).Error(errorMessage)
                return nil, fmt.Errorf(
                        "%s: %w",
                        errorMessage,
                        err,
                )
        }

        container, err := c.containerStore.Get(r.GetContainerId())
        if err != nil {
                return nil, fmt.Errorf("an error occurred when try to find container %q: %w", r.GetContainerId(), err)
        }

        // Only running containers can be checkpointed.
        state := container.Status.Get().State()
        if state != runtime.ContainerState_CONTAINER_RUNNING {
                return nil, fmt.Errorf(
                        "container %q is in %s state. only %s containers can be checkpointed",
                        r.GetContainerId(),
                        criContainerStateToString(state),
                        criContainerStateToString(runtime.ContainerState_CONTAINER_RUNNING),
                )
        }

        imageRef := container.ImageRef
        image, err := c.GetImage(imageRef)
        if err != nil {
                return nil, fmt.Errorf("getting container image failed: %w", err)
        }

        i, err := container.Container.Info(ctx)
        if err != nil {
                return nil, fmt.Errorf("get container info: %w", err)
        }

        // Metadata describing the checkpointed container; stored alongside the
        // checkpoint data in the final archive.
        configJSON, err := json.Marshal(&crmetadata.ContainerConfig{
                ID:   container.ID,
                Name: container.Name,
                RootfsImageName: func() string {
                        if len(image.References) > 0 {
                                return image.References[0]
                        }
                        return ""
                }(),
                RootfsImageRef: imageRef,
                OCIRuntime:     i.Runtime.Name,
                RootfsImage:    container.Config.GetImage().UserSpecifiedImage,
                CheckpointedAt: time.Now(),
                CreatedTime:    i.CreatedAt,
        })
        if err != nil {
                return nil, fmt.Errorf("generating container config JSON failed: %w", err)
        }

        task, err := container.Container.Task(ctx, nil)
        if err != nil {
                return nil, fmt.Errorf("failed to get task for container %q: %w", r.GetContainerId(), err)
        }
        img, err := task.Checkpoint(ctx, []client.CheckpointTaskOpts{withCheckpointOpts(i.Runtime.Name, c.getContainerRootDir(r.GetContainerId()))}...)
        if err != nil {
                return nil, fmt.Errorf("checkpointing container %q failed: %w", r.GetContainerId(), err)
        }

        // the checkpoint image has been provided as an index with manifests representing the tar of criu data, the rw layer, and the config
        var (
                index        v1.Index
                rawIndex     []byte
                targetDesc   = img.Target()
                contentStore = img.ContentStore()
        )

        rawIndex, err = content.ReadBlob(ctx, contentStore, targetDesc)
        if err != nil {
                return nil, fmt.Errorf("failed to retrieve checkpoint index blob from content store: %w", err)
        }
        if err = json.Unmarshal(rawIndex, &index); err != nil {
                return nil, fmt.Errorf("failed to unmarshall blob into checkpoint data OCI index: %w", err)
        }

        // Staging directory for everything that goes into the final tarball.
        cpPath := filepath.Join(c.getContainerRootDir(r.GetContainerId()), "ctrd-checkpoint")
        if err := os.MkdirAll(cpPath, 0o700); err != nil {
                return nil, err
        }
        defer os.RemoveAll(cpPath)

        if err := os.WriteFile(filepath.Join(cpPath, crmetadata.ConfigDumpFile), configJSON, 0o600); err != nil {
                return nil, err
        }

        // walk the manifests and pull out the blobs that we need to save in the checkpoint tarball:
        // - the checkpoint criu data
        // - the rw diff tarball
        // - the spec blob
        for _, manifest := range index.Manifests {
                switch manifest.MediaType {
                case images.MediaTypeContainerd1Checkpoint:
                        if err := writeCriuCheckpointData(ctx, contentStore, manifest, cpPath); err != nil {
                                return nil, fmt.Errorf("failed to copy CRIU checkpoint blob to checkpoint dir: %w", err)
                        }
                case v1.MediaTypeImageLayerGzip:
                        if err := writeRootFsDiffTar(ctx, contentStore, manifest, cpPath); err != nil {
                                return nil, fmt.Errorf("failed to copy rw filesystem layer blob to checkpoint dir: %w", err)
                        }
                case images.MediaTypeContainerd1CheckpointConfig:
                        if err := writeSpecDumpFile(ctx, contentStore, manifest, cpPath); err != nil {
                                return nil, fmt.Errorf("failed to copy container spec blob to checkpoint dir: %w", err)
                        }
                default:
                }
        }

        // write final tarball of all content
        //
        // The destination is opened before the diff stream is created so that
        // a failed open never leaves an unread, unclosed stream behind.
        outFile, err := os.OpenFile(r.Location, os.O_RDWR|os.O_CREATE, 0600)
        if err != nil {
                return nil, err
        }
        // Safety net for early returns; the success path closes explicitly
        // below so that the close error is not lost.
        defer outFile.Close()

        diffStream := archive.Diff(ctx, "", cpPath)
        _, copyErr := io.Copy(outFile, diffStream)
        // Always close the stream exactly once, even when the copy failed.
        closeErr := diffStream.Close()
        if copyErr != nil {
                return nil, copyErr
        }
        if closeErr != nil {
                return nil, closeErr
        }
        if err := outFile.Close(); err != nil {
                return nil, err
        }

        containerCheckpointTimer.WithValues(i.Runtime.Name).UpdateSince(start)

        return &runtime.CheckpointContainerResponse{}, nil
}

// withCheckpointOpts returns a checkpoint task option that configures the
// checkpoint for the given runtime rt. For the runc v2 runtime it makes sure
// runc checkpoint options are present, keeps the container running after the
// checkpoint and sets rootDir as the checkpoint work path. For any other
// runtime the option is a no-op.
//
// The returned option fails if options of an unexpected type have already
// been set on the task info.
func withCheckpointOpts(rt, rootDir string) client.CheckpointTaskOpts {
        return func(r *client.CheckpointTaskInfo) error {
                // Kubernetes currently supports checkpointing of container
                // as part of the Forensic Container Checkpointing KEP.
                // This implies that the container is never stopped
                leaveRunning := true

                switch rt {
                case plugins.RuntimeRuncV2:
                        if r.Options == nil {
                                r.Options = &options.CheckpointOptions{}
                        }
                        // Guard the assertion: a mismatching or nil options value
                        // would otherwise lead to a nil pointer dereference below.
                        opts, ok := r.Options.(*options.CheckpointOptions)
                        if !ok || opts == nil {
                                return fmt.Errorf("unexpected checkpoint options type %T for runtime %q", r.Options, rt)
                        }

                        opts.Exit = !leaveRunning
                        opts.WorkPath = rootDir
                }
                return nil
        }
}

// writeCriuCheckpointData reads the blob described by desc from store,
// treats it as a tar archive of CRIU data and unpacks every entry into the
// crmetadata.CheckpointDirectory directory below cpPath.
//
// Entries whose name contains ".." are rejected. An error is returned if the
// blob cannot be read, the directory cannot be created or any entry cannot be
// written completely.
func writeCriuCheckpointData(ctx context.Context, store content.Store, desc v1.Descriptor, cpPath string) error {
        ra, err := store.ReaderAt(ctx, desc)
        if err != nil {
                return err
        }
        defer ra.Close()

        checkpointDirectory := filepath.Join(cpPath, crmetadata.CheckpointDirectory)
        // This is the criu data tarball. Let's unpack it
        // and put it into the crmetadata.CheckpointDirectory directory.
        if err := os.MkdirAll(checkpointDirectory, 0o700); err != nil {
                return err
        }
        tr := tar.NewReader(content.NewReader(ra))
        for {
                header, err := tr.Next()
                if err != nil {
                        if errors.Is(err, io.EOF) {
                                break
                        }
                        return err
                }
                if strings.Contains(header.Name, "..") {
                        return fmt.Errorf("found illegal string '..' in checkpoint archive")
                }
                // Each entry is written by a helper so that its file is closed
                // as soon as the entry is done instead of at function return.
                if err := writeCheckpointEntry(filepath.Join(checkpointDirectory, header.Name), tr, header.Size); err != nil {
                        return err
                }
        }
        return nil
}

// writeCheckpointEntry creates (or truncates) the file at path and copies
// exactly size bytes from src into it. The file is closed before returning;
// a close error is reported when no earlier error occurred.
func writeCheckpointEntry(path string, src io.Reader, size int64) (retErr error) {
        destFile, err := os.Create(path)
        if err != nil {
                return err
        }
        defer func() {
                if err := destFile.Close(); err != nil && retErr == nil {
                        retErr = err
                }
        }()

        _, err = io.CopyN(destFile, src, size)
        return err
}

// writeRootFsDiffTar copies the blob described by desc (the rw layer
// tarball) from store into the crmetadata.RootFsDiffTar file below cpPath.
func writeRootFsDiffTar(ctx context.Context, store content.Store, desc v1.Descriptor, cpPath string) error {
        blob, err := store.ReaderAt(ctx, desc)
        if err != nil {
                return err
        }
        defer blob.Close()

        // Destination of the rw layer tarball inside the checkpoint directory.
        dstPath := filepath.Join(cpPath, crmetadata.RootFsDiffTar)
        dst, err := os.Create(dstPath)
        if err != nil {
                return err
        }
        defer dst.Close()

        if _, err := io.Copy(dst, content.NewReader(blob)); err != nil {
                return err
        }
        return nil
}

// writeSpecDumpFile reads the blob described by desc from store, decodes it
// as a protobuf Any and writes the Any's raw value (the container spec) to
// the crmetadata.SpecDumpFile file below cpPath.
func writeSpecDumpFile(ctx context.Context, store content.Store, desc v1.Descriptor, cpPath string) error {
        // The output file is created first, before the blob is fetched.
        specPath := filepath.Join(cpPath, crmetadata.SpecDumpFile)
        out, err := os.Create(specPath)
        if err != nil {
                return err
        }
        defer out.Close()

        raw, err := content.ReadBlob(ctx, store, desc)
        if err != nil {
                return err
        }

        var wrapped ptypes.Any
        if err := proto.Unmarshal(raw, &wrapped); err != nil {
                return err
        }

        if _, err := out.Write(wrapped.Value); err != nil {
                return err
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "errors"
        "fmt"
        "path/filepath"
        "strconv"
        "strings"
        "time"

        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
        "github.com/davecgh/go-spew/spew"
        imagespec "github.com/opencontainers/image-spec/specs-go/v1"
        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        "github.com/opencontainers/selinux/go-selinux"
        "github.com/opencontainers/selinux/go-selinux/label"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/internal/cri/annotations"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        cio "github.com/containerd/containerd/v2/internal/cri/io"
        crilabels "github.com/containerd/containerd/v2/internal/cri/labels"
        customopts "github.com/containerd/containerd/v2/internal/cri/opts"
        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/containerd/v2/pkg/blockio"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/platforms"
)

// init registers containerstore.Metadata with typeurl under the path
// segments "github.com/containerd/cri/pkg/store/container" and "Metadata".
func init() {
        typeurl.Register(
                &containerstore.Metadata{},
                "github.com/containerd/cri/pkg/store/container",
                "Metadata",
        )
}

// CreateContainer creates a new container in the given PodSandbox.
//
// It reserves a unique container name, resolves the (already pulled) image,
// creates the container root directories, builds the OCI spec, creates the
// containerd container and finally adds the container to the internal
// container store. The ID of the new container is returned in the response.
//
// Every step that allocates a resource registers a deferred cleanup that only
// runs when the function returns a non-nil error (tracked through the named
// result retErr), so a failed creation releases what was set up before it.
func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateContainerRequest) (_ *runtime.CreateContainerResponse, retErr error) {
        config := r.GetConfig()
        log.G(ctx).Debugf("Container config %+v", config)
        sandboxConfig := r.GetSandboxConfig()
        sandbox, err := c.sandboxStore.Get(r.GetPodSandboxId())
        if err != nil {
                return nil, fmt.Errorf("failed to find sandbox id %q: %w", r.GetPodSandboxId(), err)
        }

        // Query the sandbox service for the sandbox ID and pid used below.
        cstatus, err := c.sandboxService.SandboxStatus(ctx, sandbox.Sandboxer, sandbox.ID, false)
        if err != nil {
                return nil, fmt.Errorf("failed to get controller status: %w", err)
        }

        var (
                sandboxID  = cstatus.SandboxID
                sandboxPid = cstatus.Pid
        )

        // Generate unique id and name for the container and reserve the name.
        // Reserve the container name to avoid concurrent `CreateContainer` request creating
        // the same container.
        id := util.GenerateID()
        metadata := config.GetMetadata()
        if metadata == nil {
                return nil, errors.New("container config must include metadata")
        }
        containerName := metadata.Name
        name := makeContainerName(metadata, sandboxConfig.GetMetadata())
        log.G(ctx).Debugf("Generated id %q for container %q", id, name)
        if err = c.containerNameIndex.Reserve(name, id); err != nil {
                return nil, fmt.Errorf("failed to reserve container name %q: %w", name, err)
        }
        defer func() {
                // Release the name if the function returns with an error.
                if retErr != nil {
                        c.containerNameIndex.ReleaseByName(name)
                }
        }()

        // Create initial internal container metadata.
        meta := containerstore.Metadata{
                ID:        id,
                Name:      name,
                SandboxID: sandboxID,
                Config:    config,
        }

        // Prepare container image snapshot. For container, the image should have
        // been pulled before creating the container, so do not ensure the image.
        image, err := c.LocalResolve(config.GetImage().GetImage())
        if err != nil {
                return nil, fmt.Errorf("failed to resolve image %q: %w", config.GetImage().GetImage(), err)
        }
        containerdImage, err := c.toContainerdImage(ctx, image)
        if err != nil {
                return nil, fmt.Errorf("failed to get image from containerd %q: %w", image.ID, err)
        }

        // Start of the interval reported to containerCreateTimer at the end.
        start := time.Now()

        // Create container root directory.
        containerRootDir := c.getContainerRootDir(id)
        if err = c.os.MkdirAll(containerRootDir, 0755); err != nil {
                return nil, fmt.Errorf("failed to create container root directory %q: %w",
                        containerRootDir, err)
        }
        defer func() {
                if retErr != nil {
                        // Cleanup the container root directory.
                        if err = c.os.RemoveAll(containerRootDir); err != nil {
                                log.G(ctx).WithError(err).Errorf("Failed to remove container root directory %q",
                                        containerRootDir)
                        }
                }
        }()
        volatileContainerRootDir := c.getVolatileContainerRootDir(id)
        if err = c.os.MkdirAll(volatileContainerRootDir, 0755); err != nil {
                return nil, fmt.Errorf("failed to create volatile container root directory %q: %w",
                        volatileContainerRootDir, err)
        }
        defer func() {
                if retErr != nil {
                        // Cleanup the volatile container root directory.
                        if err = c.os.RemoveAll(volatileContainerRootDir); err != nil {
                                log.G(ctx).WithError(err).Errorf("Failed to remove volatile container root directory %q",
                                        volatileContainerRootDir)
                        }
                }
        }()

        platform, err := c.sandboxService.SandboxPlatform(ctx, sandbox.Sandboxer, sandboxID)
        if err != nil {
                return nil, fmt.Errorf("failed to query sandbox platform: %w", err)
        }

        var volumeMounts []*runtime.Mount
        if !c.config.IgnoreImageDefinedVolumes {
                // Create container image volumes mounts.
                volumeMounts = c.volumeMounts(platform, containerRootDir, config, &image.ImageSpec.Config)
        } else if len(image.ImageSpec.Config.Volumes) != 0 {
                log.G(ctx).Debugf("Ignoring volumes defined in image %v because IgnoreImageDefinedVolumes is set", image.ID)
        }

        ociRuntime, err := c.config.GetSandboxRuntime(sandboxConfig, sandbox.Metadata.RuntimeHandler)
        if err != nil {
                return nil, fmt.Errorf("failed to get sandbox runtime: %w", err)
        }
        // Look up the runtime handler whose name matches the sandbox's handler;
        // runtimeHandler stays nil when none matches.
        var runtimeHandler *runtime.RuntimeHandler
        for _, f := range c.runtimeHandlers {
                // Per-iteration copy of the loop variable.
                f := f
                if f.Name == sandbox.Metadata.RuntimeHandler {
                        runtimeHandler = f
                        break
                }
        }
        log.G(ctx).Debugf("Use OCI runtime %+v for sandbox %q and container %q", ociRuntime, sandboxID, id)

        spec, err := c.buildContainerSpec(
                platform,
                id,
                sandboxID,
                sandboxPid,
                sandbox.NetNSPath,
                containerName,
                containerdImage.Name(),
                config,
                sandboxConfig,
                &image.ImageSpec.Config,
                volumeMounts,
                ociRuntime,
                runtimeHandler,
        )
        if err != nil {
                return nil, fmt.Errorf("failed to generate container %q spec: %w", id, err)
        }

        // Record the SELinux label in the metadata before it may be modified
        // or cleared in the spec below.
        meta.ProcessLabel = spec.Process.SelinuxLabel

        // handle any KVM based runtime
        if err := modifyProcessLabel(ociRuntime.Type, spec); err != nil {
                return nil, err
        }

        if config.GetLinux().GetSecurityContext().GetPrivileged() {
                // If privileged don't set the SELinux label but still record it on the container so
                // the unused MCS label can be released later
                spec.Process.SelinuxLabel = ""
        }
        defer func() {
                if retErr != nil {
                        selinux.ReleaseLabel(spec.Process.SelinuxLabel)
                }
        }()

        log.G(ctx).Debugf("Container %q spec: %#+v", id, spew.NewFormatter(spec))

        // Grab any platform specific snapshotter opts.
        sOpts, err := snapshotterOpts(config)
        if err != nil {
                return nil, err
        }

        // Set snapshotter before any other options.
        opts := []containerd.NewContainerOpts{
                containerd.WithSnapshotter(c.RuntimeSnapshotter(ctx, ociRuntime)),
                // Prepare container rootfs. This is always writeable even if
                // the container wants a readonly rootfs since we want to give
                // the runtime (runc) a chance to modify (e.g. to create mount
                // points corresponding to spec.Mounts) before making the
                // rootfs readonly (requested by spec.Root.Readonly).
                customopts.WithNewSnapshot(id, containerdImage, sOpts...),
        }
        if len(volumeMounts) > 0 {
                // Map each cleaned host path of an image volume to its container path.
                mountMap := make(map[string]string)
                for _, v := range volumeMounts {
                        mountMap[filepath.Clean(v.HostPath)] = v.ContainerPath
                }
                opts = append(opts, customopts.WithVolumes(mountMap, platform))
        }
        meta.ImageRef = image.ID
        meta.StopSignal = image.ImageSpec.Config.StopSignal

        // Validate log paths and compose full container log path.
        if sandboxConfig.GetLogDirectory() != "" && config.GetLogPath() != "" {
                meta.LogPath = filepath.Join(sandboxConfig.GetLogDirectory(), config.GetLogPath())
                log.G(ctx).Debugf("Composed container full log path %q using sandbox log dir %q and container log path %q",
                        meta.LogPath, sandboxConfig.GetLogDirectory(), config.GetLogPath())
        } else {
                log.G(ctx).Infof("Logging will be disabled due to empty log paths for sandbox (%q) or container (%q)",
                        sandboxConfig.GetLogDirectory(), config.GetLogPath())
        }

        // Create the container IO: streams to the sandbox endpoint for the
        // streaming IO type, new FIFOs in the volatile root dir otherwise.
        var containerIO *cio.ContainerIO
        switch ociRuntime.IOType {
        case criconfig.IOTypeStreaming:
                containerIO, err = cio.NewContainerIO(id,
                        cio.WithStreams(sandbox.Endpoint.Address, config.GetTty(), config.GetStdin()))
        default:
                containerIO, err = cio.NewContainerIO(id,
                        cio.WithNewFIFOs(volatileContainerRootDir, config.GetTty(), config.GetStdin()))
        }
        if err != nil {
                return nil, fmt.Errorf("failed to create container io: %w", err)
        }
        defer func() {
                if retErr != nil {
                        if err := containerIO.Close(); err != nil {
                                log.G(ctx).WithError(err).Errorf("Failed to close container io %q", id)
                        }
                }
        }()

        specOpts, err := c.platformSpecOpts(platform, config, &image.ImageSpec.Config)
        if err != nil {
                return nil, fmt.Errorf("failed to get container spec opts: %w", err)
        }

        containerLabels := buildLabels(config.Labels, image.ImageSpec.Config.Labels, crilabels.ContainerKindContainer)

        // TODO the sandbox in the cache should hold this info
        runtimeName, runtimeOption, err := c.runtimeInfo(ctx, sandboxID)
        if err != nil {
                return nil, fmt.Errorf("unable to get sandbox %q runtime info: %w", sandboxID, err)
        }

        opts = append(opts,
                containerd.WithSpec(spec, specOpts...),
                containerd.WithRuntime(runtimeName, runtimeOption),
                containerd.WithContainerLabels(containerLabels),
                containerd.WithContainerExtension(crilabels.ContainerMetadataExtension, &meta),
        )

        opts = append(opts, containerd.WithSandbox(sandboxID))

        opts = append(opts, c.nri.WithContainerAdjustment())
        defer func() {
                if retErr != nil {
                        // Use a dedicated context for the cleanup instead of ctx.
                        deferCtx, deferCancel := util.DeferContext()
                        defer deferCancel()
                        c.nri.UndoCreateContainer(deferCtx, &sandbox, id, spec)
                }
        }()

        var cntr containerd.Container
        if cntr, err = c.client.NewContainer(ctx, id, opts...); err != nil {
                return nil, fmt.Errorf("failed to create containerd container: %w", err)
        }
        defer func() {
                if retErr != nil {
                        // Use a dedicated context for the cleanup instead of ctx.
                        deferCtx, deferCancel := util.DeferContext()
                        defer deferCancel()
                        if err := cntr.Delete(deferCtx, containerd.WithSnapshotCleanup); err != nil {
                                log.G(ctx).WithError(err).Errorf("Failed to delete containerd container %q", id)
                        }
                }
        }()

        status := containerstore.Status{CreatedAt: time.Now().UnixNano()}
        status = copyResourcesToStatus(spec, status)
        container, err := containerstore.NewContainer(meta,
                containerstore.WithStatus(status, containerRootDir),
                containerstore.WithContainer(cntr),
                containerstore.WithContainerIO(containerIO),
        )
        if err != nil {
                return nil, fmt.Errorf("failed to create internal container object for %q: %w", id, err)
        }
        defer func() {
                if retErr != nil {
                        // Cleanup container checkpoint on error.
                        if err := container.Delete(); err != nil {
                                log.G(ctx).WithError(err).Errorf("Failed to cleanup container checkpoint for %q", id)
                        }
                }
        }()

        // Add container into container store.
        if err := c.containerStore.Add(container); err != nil {
                return nil, fmt.Errorf("failed to add container %q into store: %w", id, err)
        }

        c.generateAndSendContainerEvent(ctx, id, sandboxID, runtime.ContainerEventType_CONTAINER_CREATED_EVENT)

        // A failed NRI post-create notification is only logged; it does not
        // fail the container creation.
        err = c.nri.PostCreateContainer(ctx, &sandbox, &container)
        if err != nil {
                log.G(ctx).WithError(err).Errorf("NRI post-create notification failed")
        }

        containerCreateTimer.WithValues(ociRuntime.Type).UpdateSince(start)

        return &runtime.CreateContainerResponse{ContainerId: id}, nil
}

// volumeMounts builds the mounts for the volumes declared by the image
// config. Each volume gets a freshly generated host directory path below
// "<containerRootDir>/volumes"; cleanup relies on the removal of the container
// root directory. An image volume is skipped when the CRI mounts of the
// container already specify the same destination. On Linux the user namespace
// ID mappings of the container, if any, are attached to every mount. It
// returns nil when the image declares no volumes.
func (c *criService) volumeMounts(platform platforms.Platform, containerRootDir string, containerConfig *runtime.ContainerConfig, config *imagespec.ImageConfig) []*runtime.Mount {
        if len(config.Volumes) == 0 {
                return nil
        }

        isLinux := platform.OS == "linux"

        var uidMaps, gidMaps []*runtime.IDMapping
        if isLinux {
                usernsOpts := containerConfig.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
                if usernsOpts != nil {
                        uidMaps = usernsOpts.GetUids()
                        gidMaps = usernsOpts.GetGids()
                }
        }

        userMounts := containerConfig.GetMounts()

        var imageVolumeMounts []*runtime.Mount
        for volumePath := range config.Volumes {
                // Skip the image volume, if there is CRI defined volume mapping.
                // TODO(random-liu): This should be handled by Kubelet in the future.
                // Kubelet should decide what to use for image volume, and also de-duplicate
                // the image volume and user mounts.
                if isInCRIMounts(volumePath, userMounts) {
                        continue
                }

                hostDir := filepath.Join(containerRootDir, "volumes", util.GenerateID())

                // When the platform OS is Linux, ensure the destination is a _Linux_ abs path.
                // filepath.IsAbs() can't be used because, when executing on Windows, it
                // checks for Windows abs paths.
                containerPath := volumePath
                if isLinux && !strings.HasPrefix(containerPath, "/") {
                        // On Windows, ToSlash() is needed to ensure the path is a valid Linux path.
                        // On Linux, ToSlash() is a no-op.
                        containerPath = filepath.ToSlash(filepath.Join("/", volumePath))
                        log.L.Debugf("Volume destination %q is not absolute, converted to %q", volumePath, containerPath)
                }

                // addOCIBindMounts will create these volumes.
                imageVolumeMounts = append(imageVolumeMounts, &runtime.Mount{
                        ContainerPath:  containerPath,
                        HostPath:       hostDir,
                        SelinuxRelabel: true,
                        UidMappings:    uidMaps,
                        GidMappings:    gidMaps,
                })
        }
        return imageVolumeMounts
}

// runtimeSpec returns the runtime spec used in cri-containerd for the
// container with the given id. Without a base spec file the spec is generated
// for the given platform with opts applied. With a base spec file the loaded
// base spec is deep-copied and a namespaced-cgroup option followed by opts is
// applied to the copy.
func (c *criService) runtimeSpec(id string, platform platforms.Platform, baseSpecFile string, opts ...oci.SpecOpts) (*runtimespec.Spec, error) {
        // GenerateSpec needs namespace.
        ctx := util.NamespacedContext()
        ctr := &containers.Container{ID: id}

        if baseSpecFile == "" {
                generated, err := oci.GenerateSpecWithPlatform(ctx, nil, platforms.Format(platform), ctr, opts...)
                if err != nil {
                        return nil, fmt.Errorf("failed to generate spec: %w", err)
                }
                return generated, nil
        }

        baseSpec, err := c.LoadOCISpec(baseSpecFile)
        if err != nil {
                return nil, fmt.Errorf("can't load base OCI spec %q: %w", baseSpecFile, err)
        }

        // Work on a copy so the loaded base spec itself is not modified.
        var spec oci.Spec
        if err := util.DeepCopy(&spec, &baseSpec); err != nil {
                return nil, fmt.Errorf("failed to clone OCI spec: %w", err)
        }

        // Fix up cgroups path
        allOpts := make([]oci.SpecOpts, 0, len(opts)+1)
        allOpts = append(allOpts, oci.WithNamespacedCgroup())
        allOpts = append(allOpts, opts...)

        if err := oci.ApplyOpts(ctx, nil, ctr, &spec, allOpts...); err != nil {
                return nil, fmt.Errorf("failed to apply OCI options: %w", err)
        }

        return &spec, nil
}

// Names shared by the container spec generation code.
const (
        // relativeRootfsPath is the rootfs path relative to bundle path.
        relativeRootfsPath = "rootfs"
        // hostnameEnv is the name (key) of the HOSTNAME environment variable.
        hostnameEnv = "HOSTNAME"
)

// generateUserString builds a user string in the form accepted by the OCI
// Image Spec v1.0.0 ("user" or "user:group") from the CRI user fields.
//
// A non-empty username takes precedence over uid. The combinations map as
// follows:
//
// (none) -> ""
// username -> username
// username, uid -> username
// username, uid, gid -> username:gid
// username, gid -> username:gid
// uid -> uid
// uid, gid -> uid:gid
// gid -> error
//
// TODO(random-liu): Add group name support in CRI.
func generateUserString(username string, uid, gid *runtime.Int64Value) (string, error) {
        // Resolve the user part: the name wins, the numeric uid is the fallback.
        user := username
        if user == "" && uid != nil {
                user = strconv.FormatInt(uid.GetValue(), 10)
        }

        var group string
        if gid != nil {
                group = strconv.FormatInt(gid.GetValue(), 10)
        }

        switch {
        case user == "" && group != "":
                return "", fmt.Errorf("user group %q is specified without user", group)
        case user == "":
                return "", nil
        case group != "":
                return user + ":" + group, nil
        default:
                return user, nil
        }
}

// platformSpecOpts collects additional runtime spec options that may rely on
// runtime information (rootfs mounted), or platform specific checks with
// no defined workaround (yet) to specify for other platforms.
//
// For Linux platforms a user option is added first, taken from the CRI
// security context or, when that yields no user, from the image config. The
// options returned by containerSpecOpts are appended afterwards.
func (c *criService) platformSpecOpts(
        platform platforms.Platform,
        config *runtime.ContainerConfig,
        imageConfig *imagespec.ImageConfig,
) ([]oci.SpecOpts, error) {
        var result []oci.SpecOpts

        // First deal with the set of options we can use across platforms currently.
        // Linux user strings have workarounds on other platforms to avoid needing to
        // mount the rootfs, but on Linux hosts it must be mounted
        //
        // TODO(dcantah): I think the seccomp package can be made to compile on
        // !linux and used here as well.
        if platform.OS == "linux" {
                // Set container username. This could only be done by containerd, because it needs
                // access to the container rootfs. Pass user name to containerd, and let it overwrite
                // the spec for us.
                secCtx := config.GetLinux().GetSecurityContext()
                user, err := generateUserString(secCtx.GetRunAsUsername(), secCtx.GetRunAsUser(), secCtx.GetRunAsGroup())
                if err != nil {
                        return nil, fmt.Errorf("failed to generate user string: %w", err)
                }
                // Lastly, since no user override was passed via CRI try to set via OCI
                // Image
                if user == "" {
                        user = imageConfig.User
                }
                if user != "" {
                        result = append(result, oci.WithUser(user))
                }
        }

        // Now grab the truly platform specific options (seccomp, apparmor etc. for linux
        // for example).
        platformOpts, err := c.containerSpecOpts(config, imageConfig)
        if err != nil {
                return nil, err
        }

        return append(result, platformOpts...), nil
}

// buildContainerSpec produces the container's OCI runtime spec. The platform
// OS selects which per-OS builder supplies the spec options; those options are
// then passed to c.runtimeSpec together with the runtime's base spec. An
// unrecognised platform OS is rejected with an error.
func (c *criService) buildContainerSpec(
        platform platforms.Platform,
        id string,
        sandboxID string,
        sandboxPid uint32,
        netNSPath string,
        containerName string,
        imageName string,
        config *runtime.ContainerConfig,
        sandboxConfig *runtime.PodSandboxConfig,
        imageConfig *imagespec.ImageConfig,
        extraMounts []*runtime.Mount,
        ociRuntime criconfig.Runtime,
        runtimeHandler *runtime.RuntimeHandler,
) (_ *runtimespec.Spec, retErr error) {
        var (
                opts    []oci.SpecOpts
                optsErr error
        )

        switch platform.OS {
        case "linux":
                // Only Linux gets the generated system mounts; the other
                // platforms receive extraMounts unchanged.
                mounts := append(c.linuxContainerMounts(sandboxID, config), extraMounts...)
                opts, optsErr = c.buildLinuxSpec(
                        id,
                        sandboxID,
                        sandboxPid,
                        containerName,
                        imageName,
                        config,
                        sandboxConfig,
                        imageConfig,
                        mounts,
                        ociRuntime,
                        runtimeHandler,
                )
        case "windows":
                opts, optsErr = c.buildWindowsSpec(
                        sandboxID,
                        netNSPath,
                        containerName,
                        imageName,
                        config,
                        sandboxConfig,
                        imageConfig,
                        extraMounts,
                        ociRuntime,
                )
        case "darwin":
                opts, optsErr = c.buildDarwinSpec(
                        sandboxID,
                        containerName,
                        imageName,
                        config,
                        sandboxConfig,
                        imageConfig,
                        extraMounts,
                        ociRuntime,
                )
        default:
                return nil, fmt.Errorf("unsupported spec platform: %s", platform.OS)
        }
        if optsErr != nil {
                return nil, fmt.Errorf("failed to generate spec opts: %w", optsErr)
        }

        return c.runtimeSpec(id, platform, ociRuntime.BaseRuntimeSpec, opts...)
}

// buildLinuxSpec assembles the ordered list of spec options for a Linux
// container from the CRI container config, the sandbox config, the image
// config and the selected runtime configuration.
//
// The options are appended in a fixed order, so later options can override
// earlier ones. An error is returned when the hostname cannot be determined,
// SELinux labels cannot be derived, a privileged container is requested in a
// non-privileged sandbox, the blockio or RDT class cannot be resolved, the
// target PID-namespace container is invalid, or the user namespace
// configuration is invalid or differs from the sandbox's.
//
// Once SELinux labels have been initialised, the process label is released
// again if the function returns an error.
func (c *criService) buildLinuxSpec(
        id string,
        sandboxID string,
        sandboxPid uint32,
        containerName string,
        imageName string,
        config *runtime.ContainerConfig,
        sandboxConfig *runtime.PodSandboxConfig,
        imageConfig *imagespec.ImageConfig,
        extraMounts []*runtime.Mount,
        ociRuntime criconfig.Runtime,
        runtimeHandler *runtime.RuntimeHandler,
) (_ []oci.SpecOpts, retErr error) {
        specOpts := []oci.SpecOpts{
                oci.WithoutRunMount,
        }
        // only clear the default security settings if the runtime does not have a custom
        // base runtime spec.  Admins can use this functionality to define
        // default ulimits, seccomp, or other default settings.
        if ociRuntime.BaseRuntimeSpec == "" {
                specOpts = append(specOpts, customopts.WithoutDefaultSecuritySettings)
        }

        specOpts = append(specOpts,
                customopts.WithRelativeRoot(relativeRootfsPath),
                customopts.WithProcessArgs(config, imageConfig),
                oci.WithDefaultPathEnv,
                // this will be set based on the security context below
                oci.WithNewPrivileges,
        )

        // The container config's working directory takes precedence over the image's.
        if config.GetWorkingDir() != "" {
                specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir()))
        } else if imageConfig.WorkingDir != "" {
                specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
        }

        if config.GetTty() {
                specOpts = append(specOpts, oci.WithTTY)
        }

        // Add HOSTNAME env, falling back to the host's hostname when the
        // sandbox config does not specify one.
        var (
                err      error
                hostname = sandboxConfig.GetHostname()
        )
        if hostname == "" {
                if hostname, err = c.os.Hostname(); err != nil {
                        return nil, err
                }
        }
        specOpts = append(specOpts, oci.WithEnv([]string{hostnameEnv + "=" + hostname}))

        // Apply envs from image config first, so that envs from container config
        // can override them.
        env := append([]string{}, imageConfig.Env...)
        for _, e := range config.GetEnvs() {
                env = append(env, e.GetKey()+"="+e.GetValue())
        }
        specOpts = append(specOpts, oci.WithEnv(env))

        securityContext := config.GetLinux().GetSecurityContext()
        labelOptions, err := toLabel(securityContext.GetSelinuxOptions())
        if err != nil {
                return nil, err
        }
        if len(labelOptions) == 0 {
                // Use pod level SELinux config. A sandbox that cannot be found
                // in the store is not treated as an error here.
                if sandbox, err := c.sandboxStore.Get(sandboxID); err == nil {
                        labelOptions, err = selinux.DupSecOpt(sandbox.ProcessLabel)
                        if err != nil {
                                return nil, err
                        }
                }
        }

        processLabel, mountLabel, err := label.InitLabels(labelOptions)
        if err != nil {
                return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err)
        }
        // Release the process label again on any later failure.
        defer func() {
                if retErr != nil {
                        selinux.ReleaseLabel(processLabel)
                }
        }()

        specOpts = append(specOpts, customopts.WithMounts(c.os, config, extraMounts, mountLabel, runtimeHandler))

        if !c.config.DisableProcMount {
                // Change the default masked/readonly paths to empty slices
                // See https://github.com/containerd/containerd/issues/5029
                // TODO: Provide an option to set default paths to the ones in oci.populateDefaultUnixSpec()
                specOpts = append(specOpts, oci.WithMaskedPaths([]string{}), oci.WithReadonlyPaths([]string{}))

                // Apply masked paths if specified.
                // If the container is privileged, this will be cleared later on.
                if maskedPaths := securityContext.GetMaskedPaths(); maskedPaths != nil {
                        specOpts = append(specOpts, oci.WithMaskedPaths(maskedPaths))
                }

                // Apply readonly paths if specified.
                // If the container is privileged, this will be cleared later on.
                if readonlyPaths := securityContext.GetReadonlyPaths(); readonlyPaths != nil {
                        specOpts = append(specOpts, oci.WithReadonlyPaths(readonlyPaths))
                }
        }

        specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext),
                customopts.WithCapabilities(securityContext, c.allCaps))

        if securityContext.GetPrivileged() {
                // A privileged container is only allowed inside a privileged sandbox.
                if !sandboxConfig.GetLinux().GetSecurityContext().GetPrivileged() {
                        return nil, errors.New("no privileged container allowed in sandbox")
                }
                specOpts = append(specOpts, oci.WithPrivileged)
                if !ociRuntime.PrivilegedWithoutHostDevices {
                        specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed)
                } else if ociRuntime.PrivilegedWithoutHostDevicesAllDevicesAllowed {
                        // allow rwm on all devices for the container
                        specOpts = append(specOpts, oci.WithAllDevicesAllowed)
                }
        }

        // Clear all ambient capabilities. The implication of non-root + caps
        // is not clearly defined in Kubernetes.
        // See https://github.com/kubernetes/kubernetes/issues/56374
        // Keep docker's behavior for now.
        specOpts = append(specOpts,
                customopts.WithoutAmbientCaps,
                customopts.WithSelinuxLabels(processLabel, mountLabel),
        )

        // TODO: Figure out whether we should set no new privilege for sandbox container by default
        if securityContext.GetNoNewPrivs() {
                specOpts = append(specOpts, oci.WithNoNewPrivileges)
        }
        // TODO(random-liu): [P1] Set selinux options (privileged or not).
        if securityContext.GetReadonlyRootfs() {
                specOpts = append(specOpts, oci.WithRootFSReadonly())
        }

        if c.config.DisableCgroup {
                specOpts = append(specOpts, customopts.WithDisabledCgroups)
        } else {
                specOpts = append(specOpts, customopts.WithResources(config.GetLinux().GetResources(), c.config.TolerateMissingHugetlbController, c.config.DisableHugetlbController))
                if sandboxConfig.GetLinux().GetCgroupParent() != "" {
                        cgroupsPath := getCgroupsPath(sandboxConfig.GetLinux().GetCgroupParent(), id)
                        specOpts = append(specOpts, oci.WithCgroup(cgroupsPath))
                }
        }

        supplementalGroups := securityContext.GetSupplementalGroups()

        // Get blockio class
        blockIOClass, err := c.blockIOClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
        if err != nil {
                return nil, fmt.Errorf("failed to set blockio class: %w", err)
        }
        if blockIOClass != "" {
                linuxBlockIO, err := blockio.ClassNameToLinuxOCI(blockIOClass)
                if err != nil {
                        return nil, err
                }
                specOpts = append(specOpts, oci.WithBlockIO(linuxBlockIO))
        }

        // Get RDT class
        rdtClass, err := c.rdtClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
        if err != nil {
                return nil, fmt.Errorf("failed to set RDT class: %w", err)
        }
        if rdtClass != "" {
                specOpts = append(specOpts, oci.WithRdt(rdtClass, "", ""))
        }

        // Pass through the pod and container annotations selected by the
        // runtime configuration.
        for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations,
                ociRuntime.PodAnnotations) {
                specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
        }

        for pKey, pValue := range getPassthroughAnnotations(config.Annotations,
                ociRuntime.ContainerAnnotations) {
                specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
        }

        // Default target PID namespace is the sandbox PID.
        targetPid := sandboxPid
        // If the container targets another container's PID namespace,
        // set targetPid to the PID of that container.
        nsOpts := securityContext.GetNamespaceOptions()
        if nsOpts.GetPid() == runtime.NamespaceMode_TARGET {
                targetContainer, err := c.validateTargetContainer(sandboxID, nsOpts.TargetId)
                if err != nil {
                        return nil, fmt.Errorf("invalid target container: %w", err)
                }

                status := targetContainer.Status.Get()
                targetPid = status.Pid
        }

        uids, gids, err := parseUsernsIDs(nsOpts.GetUsernsOptions())
        if err != nil {
                return nil, fmt.Errorf("user namespace configuration: %w", err)
        }

        // Check sandbox userns config is consistent with container config.
        sandboxUsernsOpts := sandboxConfig.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
        if !sameUsernsConfig(sandboxUsernsOpts, nsOpts.GetUsernsOptions()) {
                return nil, fmt.Errorf("user namespace config for sandbox is different from container. Sandbox userns config: %v - Container userns config: %v", sandboxUsernsOpts, nsOpts.GetUsernsOptions())
        }

        specOpts = append(specOpts,
                customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj),
                customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, uids, gids),
                customopts.WithSupplementalGroups(supplementalGroups),
        )
        specOpts = append(
                specOpts,
                annotations.DefaultCRIAnnotations(sandboxID, containerName, imageName, sandboxConfig, false)...,
        )

        // cgroupns is used for hiding /sys/fs/cgroup from containers.
        // For compatibility, cgroupns is not used when running in cgroup v1 mode or in privileged.
        // https://github.com/containers/libpod/issues/4363
        // https://github.com/kubernetes/enhancements/blob/0e409b47497e398b369c281074485c8de129694f/keps/sig-node/20191118-cgroups-v2.md#cgroup-namespace
        if isUnifiedCgroupsMode() && !securityContext.GetPrivileged() {
                specOpts = append(specOpts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.CgroupNamespace}))
        }

        return specOpts, nil
}

// buildWindowsSpec assembles the ordered list of spec options for a Windows
// container. It fails when the container's HostProcess setting does not match
// the pod's.
func (c *criService) buildWindowsSpec(
        sandboxID string,
        netNSPath string,
        containerName string,
        imageName string,
        config *runtime.ContainerConfig,
        sandboxConfig *runtime.PodSandboxConfig,
        imageConfig *imagespec.ImageConfig,
        extraMounts []*runtime.Mount,
        ociRuntime criconfig.Runtime,
) (_ []oci.SpecOpts, retErr error) {
        opts := []oci.SpecOpts{
                customopts.WithProcessCommandLineOrArgsForWindows(config, imageConfig),
        }

        // HostProcess is an all-or-nothing pod property: every container must
        // carry the same value as the pod itself, otherwise the request is
        // invalid.
        containerHPC := config.GetWindows().GetSecurityContext().GetHostProcess()
        podHPC := sandboxConfig.GetWindows().GetSecurityContext().GetHostProcess()
        if containerHPC != podHPC {
                return nil, errors.New("pod spec and all containers inside must have the HostProcess field set to be valid")
        }

        // Working directory precedence: container config, then image config,
        // then a fixed default for HostProcess containers.
        switch cwd := config.GetWorkingDir(); {
        case cwd != "":
                opts = append(opts, oci.WithProcessCwd(cwd))
        case imageConfig.WorkingDir != "":
                opts = append(opts, oci.WithProcessCwd(imageConfig.WorkingDir))
        case containerHPC:
                opts = append(opts, oci.WithProcessCwd(`C:\hpc`))
        }

        if config.GetTty() {
                opts = append(opts, oci.WithTTY)
        }

        // Image envs go first so that container config envs can override them.
        containerEnvs := config.GetEnvs()
        env := make([]string, 0, len(imageConfig.Env)+len(containerEnvs))
        env = append(env, imageConfig.Env...)
        for _, kv := range containerEnvs {
                env = append(env, kv.GetKey()+"="+kv.GetValue())
        }

        opts = append(opts,
                oci.WithEnv(env),
                // Clear the root location since hcsshim expects it.
                // NOTE: readonly rootfs doesn't work on windows.
                customopts.WithoutRoot,
                oci.WithWindowsNetworkNamespace(netNSPath),
                oci.WithHostname(sandboxConfig.GetHostname()),
                customopts.WithWindowsMounts(c.os, config, extraMounts),
                customopts.WithWindowsDevices(config),
        )

        // The image's user is the default; a non-empty RunAsUsername from the
        // security context replaces it.
        username := imageConfig.User
        if win := config.GetWindows(); win != nil {
                opts = append(opts, customopts.WithWindowsResources(win.GetResources()))

                sc := win.GetSecurityContext()
                if name := sc.GetRunAsUsername(); name != "" {
                        username = name
                }
                if credSpec := sc.GetCredentialSpec(); credSpec != "" {
                        opts = append(opts, customopts.WithWindowsCredentialSpec(credSpec))
                }
        }

        // There really isn't a good Windows way to verify that the username is
        // available in the image as early as here like there is for Linux.
        // Later on in the stack hcsshim will handle the behavior of erroring
        // out if the user isn't available in the image when trying to run the
        // init process.
        opts = append(opts, oci.WithUser(username))

        // Pass through the pod annotations, then the container annotations,
        // selected by the runtime configuration.
        for key, value := range getPassthroughAnnotations(sandboxConfig.Annotations, ociRuntime.PodAnnotations) {
                opts = append(opts, customopts.WithAnnotation(key, value))
        }
        for key, value := range getPassthroughAnnotations(config.Annotations, ociRuntime.ContainerAnnotations) {
                opts = append(opts, customopts.WithAnnotation(key, value))
        }

        opts = append(opts, customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(podHPC)))
        opts = append(opts,
                annotations.DefaultCRIAnnotations(sandboxID, containerName, imageName, sandboxConfig, false)...,
        )

        return opts, nil
}

// buildDarwinSpec assembles the ordered list of spec options for a Darwin
// container: process args, working directory, TTY, environment, mounts and
// annotations. It always returns a nil error.
func (c *criService) buildDarwinSpec(
        sandboxID string,
        containerName string,
        imageName string,
        config *runtime.ContainerConfig,
        sandboxConfig *runtime.PodSandboxConfig,
        imageConfig *imagespec.ImageConfig,
        extraMounts []*runtime.Mount,
        ociRuntime criconfig.Runtime,
) (_ []oci.SpecOpts, retErr error) {
        opts := []oci.SpecOpts{
                customopts.WithProcessArgs(config, imageConfig),
        }

        // The container config's working directory wins over the image's.
        switch cwd := config.GetWorkingDir(); {
        case cwd != "":
                opts = append(opts, oci.WithProcessCwd(cwd))
        case imageConfig.WorkingDir != "":
                opts = append(opts, oci.WithProcessCwd(imageConfig.WorkingDir))
        }

        if config.GetTty() {
                opts = append(opts, oci.WithTTY)
        }

        // Image envs go first so that container config envs can override them.
        containerEnvs := config.GetEnvs()
        env := make([]string, 0, len(imageConfig.Env)+len(containerEnvs))
        env = append(env, imageConfig.Env...)
        for _, kv := range containerEnvs {
                env = append(env, kv.GetKey()+"="+kv.GetValue())
        }

        opts = append(opts,
                oci.WithEnv(env),
                customopts.WithDarwinMounts(c.os, config, extraMounts),
        )

        // Pass through the pod annotations, then the container annotations,
        // selected by the runtime configuration.
        for key, value := range getPassthroughAnnotations(sandboxConfig.Annotations, ociRuntime.PodAnnotations) {
                opts = append(opts, customopts.WithAnnotation(key, value))
        }
        for key, value := range getPassthroughAnnotations(config.Annotations, ociRuntime.ContainerAnnotations) {
                opts = append(opts, customopts.WithAnnotation(key, value))
        }

        return append(opts,
                annotations.DefaultCRIAnnotations(sandboxID, containerName, imageName, sandboxConfig, false)...,
        ), nil
}

// linuxContainerMounts returns the system file mounts a Linux container gets
// from its sandbox: /etc/hostname, /etc/hosts, /etc/resolv.conf and /dev/shm.
// A mount is skipped when the CRI config already mounts that container path
// or when the host-side file cannot be stat'ed.
func (c *criService) linuxContainerMounts(sandboxID string, config *runtime.ContainerConfig) []*runtime.Mount {
        var (
                mounts          []*runtime.Mount
                criMounts       = config.GetMounts()
                securityContext = config.GetLinux().GetSecurityContext()
                usernsOpts      = securityContext.GetNamespaceOptions().GetUsernsOptions()
                uidMappings     = usernsOpts.GetUids()
                gidMappings     = usernsOpts.GetGids()
        )

        // bindSandboxFile appends a relabeled bind mount for hostPath if that
        // path exists on the host. The mount is read-only exactly when the
        // container's root filesystem is.
        bindSandboxFile := func(containerPath, hostPath string) {
                if _, err := c.os.Stat(hostPath); err != nil {
                        return
                }
                mounts = append(mounts, &runtime.Mount{
                        ContainerPath:  containerPath,
                        HostPath:       hostPath,
                        Readonly:       securityContext.GetReadonlyRootfs(),
                        SelinuxRelabel: true,
                        UidMappings:    uidMappings,
                        GidMappings:    gidMappings,
                })
        }

        // /etc/hostname is added since 1.1.6, 1.2.4 and 1.3.
        // For in-place upgrade, the old sandbox doesn't have the hostname file,
        // so it is not mounted in that case.
        // TODO(random-liu): Remove the check and always mount this when
        // containerd 1.1 and 1.2 are deprecated.
        if !isInCRIMounts(etcHostname, criMounts) {
                bindSandboxFile(etcHostname, c.getSandboxHostname(sandboxID))
        }

        // /etc/hosts could be delegated to a remote sandbox controller, in
        // which case the file need not exist on the host side.
        if !isInCRIMounts(etcHosts, criMounts) {
                bindSandboxFile(etcHosts, c.getSandboxHosts(sandboxID))
        }

        // Mount sandbox resolv.conf. Its ownership could likewise be delegated
        // to a remote sandbox controller.
        // TODO: Need to figure out whether we should always mount it as read-only
        if !isInCRIMounts(resolvConfPath, criMounts) {
                bindSandboxFile(resolvConfPath, c.getResolvPath(sandboxID))
        }

        if !isInCRIMounts(devShm, criMounts) {
                // With host IPC the host's own /dev/shm is used instead of the
                // sandbox's.
                shmHostPath := c.getSandboxDevShm(sandboxID)
                if securityContext.GetNamespaceOptions().GetIpc() == runtime.NamespaceMode_NODE {
                        shmHostPath = devShm
                }
                // The ownership of /dev/shm could be delegated to a remote
                // sandbox controller, so the path is skipped when it is absent
                // on the host.
                if _, err := c.os.Stat(shmHostPath); err == nil {
                        // XXX: no ID mappings are set on this mount. tmpfs
                        // support for idmap mounts got merged in Linux 6.3,
                        // while our Ubuntu 22.04 CI runs 5.15 kernels (the
                        // other filesystems used support idmap mounts there).
                        // This can be enabled at a later stage, but as this
                        // tmpfs mount is exposed empty to the container (no
                        // prepopulated files) and using the hostIPC with
                        // userns is blocked by k8s, avoiding the mappings
                        // should work fine.
                        mounts = append(mounts, &runtime.Mount{
                                ContainerPath:  devShm,
                                HostPath:       shmHostPath,
                                Readonly:       false,
                                SelinuxRelabel: shmHostPath != devShm,
                        })
                }
        }
        return mounts
}

// runtimeInfo returns the runtime name and runtime options recorded for id.
// The sandbox store is consulted first; if that lookup fails, the container
// service is tried. When both lookups fail, the sandbox store's error is the
// one returned.
func (c *criService) runtimeInfo(ctx context.Context, id string) (string, typeurl.Any, error) {
        sb, sbErr := c.client.SandboxStore().Get(ctx, id)
        if sbErr != nil {
                ctr, ctrErr := c.client.ContainerService().Get(ctx, id)
                if ctrErr != nil {
                        return "", nil, sbErr
                }
                return ctr.Runtime.Name, ctr.Runtime.Options, nil
        }
        return sb.Runtime.Name, sb.Runtime.Options, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "bufio"
        "errors"
        "fmt"
        "io"
        "os"
        "strconv"
        "strings"

        imagespec "github.com/opencontainers/image-spec/specs-go/v1"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        "github.com/containerd/containerd/v2/contrib/apparmor"
        "github.com/containerd/containerd/v2/contrib/seccomp"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/pkg/oci"

        customopts "github.com/containerd/containerd/v2/internal/cri/opts"
)

const (
        // profileNamePrefix is the prefix that marks a profile as loaded from the local host, e.g. AppArmor localhost/profileName.
        profileNamePrefix = "localhost/" // TODO (mikebrow): get localhost/ & runtime/default from CRI kubernetes/kubernetes#51747
        // runtimeDefault indicates that we should use or create a runtime default profile.
        runtimeDefault = "runtime/default"
        // dockerDefault indicates that we should use or create a docker default profile.
        dockerDefault = "docker/default"
        // appArmorDefaultProfileName is the name to use when creating a default apparmor profile.
        appArmorDefaultProfileName = "cri-containerd.apparmor.d"
        // unconfinedProfile is the profile string requesting that a pod/container run without a security profile.
        unconfinedProfile = "unconfined"
        // seccompDefaultProfile is the default seccomp profile; it is an alias of dockerDefault.
        seccompDefaultProfile = dockerDefault
)

// containerSpecOpts returns the spec options derived from the container's
// Linux security context: additional GIDs, supplemental groups, the AppArmor
// and seccomp options (when any are generated) and, if CDI is enabled in the
// CRI config, the CDI option.
func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
        sc := config.GetLinux().GetSecurityContext()

        // Pick the user whose additional GIDs are looked up. Precedence:
        // RunAsUsername, RunAsUser, the user part of the image config's User,
        // and finally "0".
        var user string
        switch {
        case sc.GetRunAsUsername() != "":
                user = sc.GetRunAsUsername()
        case sc.GetRunAsUser() != nil:
                user = strconv.FormatInt(sc.GetRunAsUser().GetValue(), 10)
        case imageConfig.User != "":
                user, _, _ = strings.Cut(imageConfig.User, ":")
        default:
                user = "0" // runtime default
        }

        opts := []oci.SpecOpts{
                customopts.WithAdditionalGIDs(user),
                customopts.WithSupplementalGroups(sc.GetSupplementalGroups()),
        }

        // AppArmor: use the structured profile when present, otherwise derive
        // one from the deprecated profile string.
        apparmorProfile := sc.GetApparmor()
        if apparmorProfile == nil {
                var err error
                apparmorProfile, err = generateApparmorSecurityProfile(sc.GetApparmorProfile()) //nolint:staticcheck // Deprecated but we don't want to remove yet
                if err != nil {
                        return nil, fmt.Errorf("failed to generate apparmor spec opts: %w", err)
                }
        }
        apparmorOpt, err := generateApparmorSpecOpts(apparmorProfile, sc.GetPrivileged(), c.apparmorEnabled())
        if err != nil {
                return nil, fmt.Errorf("failed to generate apparmor spec opts: %w", err)
        }
        if apparmorOpt != nil {
                opts = append(opts, apparmorOpt)
        }

        // Seccomp: same scheme, with the configured UnsetSeccompProfile as an
        // extra fallback.
        seccompProfile := sc.GetSeccomp()
        if seccompProfile == nil {
                seccompProfile, err = generateSeccompSecurityProfile(
                        sc.GetSeccompProfilePath(), //nolint:staticcheck // Deprecated but we don't want to remove yet
                        c.config.UnsetSeccompProfile)
                if err != nil {
                        return nil, fmt.Errorf("failed to generate seccomp spec opts: %w", err)
                }
        }
        seccompOpt, err := c.generateSeccompSpecOpts(seccompProfile, sc.GetPrivileged(), c.seccompEnabled())
        if err != nil {
                return nil, fmt.Errorf("failed to generate seccomp spec opts: %w", err)
        }
        if seccompOpt != nil {
                opts = append(opts, seccompOpt)
        }

        if c.config.EnableCDI {
                opts = append(opts, customopts.WithCDI(config.Annotations, config.CDIDevices))
        }
        return opts, nil
}

// generateSeccompSecurityProfile converts the first non-empty of profilePath
// and unsetProfilePath, in that order, into a SecurityProfile. It returns
// (nil, nil) when both are empty.
func generateSeccompSecurityProfile(profilePath string, unsetProfilePath string) (*runtime.SecurityProfile, error) {
        for _, candidate := range []string{profilePath, unsetProfilePath} {
                if candidate != "" {
                        return generateSecurityProfile(candidate)
                }
        }
        return nil, nil
}
// generateApparmorSecurityProfile converts a non-empty profilePath into a
// SecurityProfile. It returns (nil, nil) for an empty path.
func generateApparmorSecurityProfile(profilePath string) (*runtime.SecurityProfile, error) {
        if profilePath == "" {
                return nil, nil
        }
        return generateSecurityProfile(profilePath)
}

// generateSecurityProfile translates a profile string into a SecurityProfile:
//   - "", runtimeDefault and dockerDefault map to RuntimeDefault;
//   - unconfinedProfile maps to Unconfined;
//   - anything else must start with profileNamePrefix and maps to Localhost,
//     with the prefix stripped from LocalhostRef.
//
// Any other value yields an "invalid profile" error.
func generateSecurityProfile(profilePath string) (*runtime.SecurityProfile, error) {
        if profilePath == unconfinedProfile {
                return &runtime.SecurityProfile{
                        ProfileType: runtime.SecurityProfile_Unconfined,
                }, nil
        }
        if profilePath == "" || profilePath == runtimeDefault || profilePath == dockerDefault {
                return &runtime.SecurityProfile{
                        ProfileType: runtime.SecurityProfile_RuntimeDefault,
                }, nil
        }

        // Require and trim the localhost profile name prefix.
        ref, hasPrefix := strings.CutPrefix(profilePath, profileNamePrefix)
        if !hasPrefix {
                return nil, fmt.Errorf("invalid profile %q", profilePath)
        }
        return &runtime.SecurityProfile{
                ProfileType:  runtime.SecurityProfile_Localhost,
                LocalhostRef: ref,
        }, nil
}

// generateSeccompSpecOpts returns the containerd SpecOpts that applies the
// seccomp profile described by sp, or nil when no seccomp option should be set.
//
// No option is produced for privileged containers, for a nil profile, or for
// an Unconfined profile. When seccomp is disabled, any profile other than
// Unconfined is rejected with an error.
func (c *criService) generateSeccompSpecOpts(sp *runtime.SecurityProfile, privileged, seccompEnabled bool) (oci.SpecOpts, error) {
        // Privileged containers never get a seccomp profile.
        if privileged {
                return nil, nil
        }

        if !seccompEnabled {
                if sp != nil && sp.ProfileType != runtime.SecurityProfile_Unconfined {
                        return nil, errors.New("seccomp is not supported")
                }
                return nil, nil
        }

        if sp == nil {
                return nil, nil
        }

        isLocalhost := sp.ProfileType == runtime.SecurityProfile_Localhost
        if !isLocalhost && sp.LocalhostRef != "" {
                return nil, errors.New("seccomp config invalid LocalhostRef must only be set if ProfileType is Localhost")
        }
        if isLocalhost {
                // The localhost/ prefix should not be present with the
                // SecurityProfile struct; it is trimmed defensively anyway.
                ref := strings.TrimPrefix(sp.LocalhostRef, profileNamePrefix)
                return seccomp.WithProfile(ref), nil
        }

        switch sp.ProfileType {
        case runtime.SecurityProfile_RuntimeDefault:
                return seccomp.WithDefaultProfile(), nil
        case runtime.SecurityProfile_Unconfined:
                // Leave the seccomp profile unset.
                return nil, nil
        default:
                return nil, errors.New("seccomp unknown ProfileType")
        }
}

// generateApparmorSpecOpts returns the containerd SpecOpts that applies the
// apparmor profile described by sp, or nil when no apparmor option should be
// set.
//
// When apparmor is disabled, only a nil or Unconfined profile is accepted. When
// it is enabled and sp is nil, the runtime default profile is assumed. A
// Localhost profile must already be loaded, otherwise an error is returned.
func generateApparmorSpecOpts(sp *runtime.SecurityProfile, privileged, apparmorEnabled bool) (oci.SpecOpts, error) {
        if !apparmorEnabled {
                // Fail loudly when the user asks for an apparmor profile that
                // cannot be honored.
                if sp != nil && sp.ProfileType != runtime.SecurityProfile_Unconfined {
                        return nil, errors.New("apparmor is not supported")
                }
                return nil, nil
        }

        if sp == nil {
                // Per kubernetes#51746, the default apparmor profile is applied
                // when none is specified.
                sp, _ = generateSecurityProfile("")
        }

        if sp.LocalhostRef != "" && sp.ProfileType != runtime.SecurityProfile_Localhost {
                return nil, errors.New("apparmor config invalid LocalhostRef must only be set if ProfileType is Localhost")
        }

        switch sp.ProfileType {
        case runtime.SecurityProfile_Localhost:
                // The localhost/ prefix should not be present with the
                // SecurityProfile struct; it is trimmed defensively anyway.
                name := strings.TrimPrefix(sp.LocalhostRef, profileNamePrefix)
                found, err := appArmorProfileExists(name)
                switch {
                case found:
                        return apparmor.WithProfile(name), nil
                case err != nil:
                        return nil, fmt.Errorf("failed to generate apparmor spec opts: %w", err)
                default:
                        return nil, fmt.Errorf("apparmor profile not found %s", name)
                }
        case runtime.SecurityProfile_RuntimeDefault:
                if privileged {
                        // Privileged containers do not get the default profile.
                        return nil, nil
                }
                // TODO (mikebrow): delete created apparmor default profile
                return apparmor.WithDefaultProfile(appArmorDefaultProfileName), nil
        case runtime.SecurityProfile_Unconfined:
                // Leave the apparmor profile unset.
                return nil, nil
        default:
                return nil, errors.New("apparmor unknown ProfileType")
        }
}

// appArmorProfileExists reports whether profile appears in
// /sys/kernel/security/apparmor/profiles.
//
// An empty profile name is rejected with an error. Errors opening or reading
// the file are returned as-is. Reaching the end of the file without a match
// yields (false, nil).
func appArmorProfileExists(profile string) (bool, error) {
        if profile == "" {
                return false, errors.New("nil apparmor profile is not supported")
        }
        profiles, err := os.Open("/sys/kernel/security/apparmor/profiles")
        if err != nil {
                return false, err
        }
        defer profiles.Close()

        // The match includes the " (" that follows the name so that a profile
        // whose name merely starts with the requested one is not accepted.
        want := profile + " ("

        rbuff := bufio.NewReader(profiles)
        for {
                line, err := rbuff.ReadString('\n')
                if err != nil && err != io.EOF {
                        return false, err
                }
                // ReadString hands back whatever it read together with io.EOF, so
                // a final line without a trailing newline still has to be checked.
                if strings.HasPrefix(line, want) {
                        return true, nil
                }
                if err == io.EOF {
                        return false, nil
                }
        }
}

// snapshotterOpts returns the Linux specific snapshotter options for the rootfs
// snapshot. They are derived from the namespace options found in the Linux
// security context of the container config.
func snapshotterOpts(config *runtime.ContainerConfig) ([]snapshots.Opt, error) {
        return snapshotterRemapOpts(config.GetLinux().GetSecurityContext().GetNamespaceOptions())
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// GetContainerEvents subscribes to the container events queue and forwards
// every received event to the stream s. It returns the first send error, or nil
// once the subscription channel is closed. The subscription is closed on return.
func (c *criService) GetContainerEvents(r *runtime.GetEventsRequest, s runtime.RuntimeService_GetContainerEventsServer) error {
        evCh, sub := c.containerEventsQ.Subscribe()
        defer sub.Close()

        for ev := range evCh {
                err := s.Send(&ev)
                if err != nil {
                        return err
                }
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// Exec prepares a streaming endpoint for executing a command in the container
// and returns its address. The container must exist in the container store and
// be in the running state; otherwise an error is returned.
func (c *criService) Exec(ctx context.Context, r *runtime.ExecRequest) (*runtime.ExecResponse, error) {
        containerID := r.GetContainerId()
        cntr, err := c.containerStore.Get(containerID)
        if err != nil {
                return nil, fmt.Errorf("failed to find container %q in store: %w", containerID, err)
        }
        if state := cntr.Status.Get().State(); state != runtime.ContainerState_CONTAINER_RUNNING {
                return nil, fmt.Errorf("container is in %s state", criContainerStateToString(state))
        }
        return c.streamServer.GetExec(r)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "bytes"
        "context"
        "fmt"
        "io"
        "syscall"
        "time"

        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "k8s.io/client-go/tools/remotecommand"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/internal/cri/config"
        cio "github.com/containerd/containerd/v2/internal/cri/io"
        "github.com/containerd/containerd/v2/internal/cri/util"
        containerdio "github.com/containerd/containerd/v2/pkg/cio"
        cioutil "github.com/containerd/containerd/v2/pkg/ioutil"
)

// cappedWriter forwards at most a fixed number of bytes to the wrapped writer
// and silently discards anything beyond that budget.
type cappedWriter struct {
        w      io.WriteCloser
        remain int // bytes that may still be forwarded to w
}

// Write forwards as much of p as the remaining budget allows. On success it
// always reports len(p), even when part or all of p was dropped, so callers do
// not observe a short write. If the wrapped writer fails, the number of bytes it
// accepted and its error are returned.
func (cw *cappedWriter) Write(p []byte) (int, error) {
        total := len(p)
        if cw.isFull() {
                return total, nil
        }

        chunk := p
        if cw.remain < total {
                chunk = p[:cw.remain]
        }

        n, err := cw.w.Write(chunk)
        cw.remain -= n
        if err != nil {
                return n, err
        }
        return total, nil
}

// Close closes the wrapped writer.
func (cw *cappedWriter) Close() error {
        return cw.w.Close()
}

// isFull reports whether the byte budget has been used up.
func (cw *cappedWriter) isFull() bool {
        return cw.remain <= 0
}

// ExecSync runs a command in the container synchronously and returns the
// captured stdout, stderr and exit code. An error is returned when the exec
// itself fails; the request timeout is interpreted in seconds.
func (c *criService) ExecSync(ctx context.Context, r *runtime.ExecSyncRequest) (*runtime.ExecSyncResponse, error) {
        const maxStreamSize = 1024 * 1024 * 16

        var stdout, stderr bytes.Buffer

        // Each stream is capped with a cappedWriter, which truncates the output.
        // When truncation happens, the size of the ExecSyncResponse will hit the
        // CRI plugin's gRPC response limit, so callers outside of the containerd
        // process (e.g. Kubelet) never see the truncated output.
        opts := execOptions{
                cmd:     r.GetCmd(),
                stdout:  &cappedWriter{w: cioutil.NewNopWriteCloser(&stdout), remain: maxStreamSize},
                stderr:  &cappedWriter{w: cioutil.NewNopWriteCloser(&stderr), remain: maxStreamSize},
                timeout: time.Duration(r.GetTimeout()) * time.Second,
        }

        exitCode, err := c.execInContainer(ctx, r.GetContainerId(), opts)
        if err != nil {
                return nil, fmt.Errorf("failed to exec in container: %w", err)
        }

        resp := &runtime.ExecSyncResponse{
                Stdout:   stdout.Bytes(),
                Stderr:   stderr.Bytes(),
                ExitCode: int32(*exitCode),
        }
        return resp, nil
}

// execOptions specifies how to execute command in container.
type execOptions struct {
        // cmd is the argument vector of the process to run.
        cmd     []string
        // stdin is the input stream for the process; nil means stdin is not attached.
        stdin   io.Reader
        // stdout receives the process's standard output; when nil, output is discarded.
        stdout  io.WriteCloser
        // stderr receives the process's standard error; when nil, output is discarded.
        stderr  io.WriteCloser
        // tty requests a terminal for the process.
        tty     bool
        // resize delivers terminal size changes to apply to the process console.
        resize  <-chan remotecommand.TerminalSize
        // timeout bounds how long the process may run; values <= 0 disable the limit.
        timeout time.Duration
}

// execInternal runs opts.cmd as a new exec process inside the task of the given
// container and blocks until that process exits or opts.timeout (when > 0)
// expires.
//
// id is used for logging and to locate the container's volatile root
// directory. On a normal exit the attached IO is drained and a pointer to the
// exit code is returned. When the timeout expires (or ctx is done), the process
// is sent SIGKILL, its exit is awaited, IO draining is attempted on a
// best-effort basis and an error is returned. Once the exec process has been
// created, a deferred call attempts to delete it before returning.
func (c *criService) execInternal(ctx context.Context, container containerd.Container, id string, opts execOptions) (*uint32, error) {
        // Cancel the context before returning to ensure goroutines are stopped.
        // This is important, because if `Start` returns error, `Wait` will hang
        // forever unless we cancel the context.
        ctx, cancel := context.WithCancel(ctx)
        defer cancel()

        var drainExecSyncIOTimeout time.Duration
        var err error

        // An empty setting leaves the drain timeout at zero.
        if c.config.DrainExecSyncIOTimeout != "" {
                drainExecSyncIOTimeout, err = time.ParseDuration(c.config.DrainExecSyncIOTimeout)
                if err != nil {
                        return nil, fmt.Errorf("failed to parse drain_exec_sync_io_timeout %q: %w",
                                c.config.DrainExecSyncIOTimeout, err)
                }
        }

        spec, err := container.Spec(ctx)
        if err != nil {
                return nil, fmt.Errorf("failed to get container spec: %w", err)
        }
        task, err := container.Task(ctx, nil)
        if err != nil {
                return nil, fmt.Errorf("failed to load task: %w", err)
        }
        // The exec process spec starts from the container's own process spec and
        // is then adjusted below.
        pspec := spec.Process

        pspec.Terminal = opts.tty
        if opts.tty {
                if err := oci.WithEnv([]string{"TERM=xterm"})(ctx, nil, nil, spec); err != nil {
                        return nil, fmt.Errorf("add TERM env var to spec: %w", err)
                }
        }

        pspec.Args = opts.cmd
        // CommandLine may already be set on the container's spec, but we want to only use Args here.
        pspec.CommandLine = ""

        // Missing output streams are replaced with loggers that discard everything.
        if opts.stdout == nil {
                opts.stdout = cio.NewDiscardLogger()
        }
        if opts.stderr == nil {
                opts.stderr = cio.NewDiscardLogger()
        }
        execID := util.GenerateID()
        log.G(ctx).Debugf("Generated exec id %q for container %q", execID, id)
        volatileRootDir := c.getVolatileContainerRootDir(id)
        // execIO is assigned by the IO creator callback below and used afterwards
        // to attach the caller's streams.
        var execIO *cio.ExecIO

        process, err := task.Exec(ctx, execID, pspec,
                func(id string) (containerdio.IO, error) {
                        cntr, err := c.containerStore.Get(container.ID())
                        if err != nil {
                                return nil, fmt.Errorf("an error occurred when try to find container %q: %w", container.ID(), err)
                        }
                        sb, err := c.sandboxStore.Get(cntr.SandboxID)
                        if err != nil {
                                return nil, fmt.Errorf("an error occurred when try to find sandbox %q: %w", cntr.SandboxID, err)
                        }
                        ociRuntime, err := c.config.GetSandboxRuntime(sb.Config, sb.Metadata.RuntimeHandler)
                        if err != nil {
                                return nil, fmt.Errorf("failed to get sandbox runtime: %w", err)
                        }
                        // The IO type configured for the sandbox runtime selects between
                        // streaming IO and the FIFO based default.
                        switch ociRuntime.IOType {
                        case config.IOTypeStreaming:
                                execIO, err = cio.NewStreamExecIO(id, sb.Endpoint.Address, opts.tty, opts.stdin != nil)
                        default:
                                execIO, err = cio.NewFifoExecIO(id, volatileRootDir, opts.tty, opts.stdin != nil)
                        }

                        return execIO, err
                },
        )
        if err != nil {
                return nil, fmt.Errorf("failed to create exec %q: %w", execID, err)
        }
        // Always try to remove the exec process on the way out. A separate context
        // is used because ctx is cancelled by the deferred cancel above.
        defer func() {
                deferCtx, deferCancel := util.DeferContext()
                defer deferCancel()
                if _, err := process.Delete(deferCtx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
                        log.G(ctx).WithError(err).Errorf("Failed to delete exec process %q for container %q", execID, id)
                }
        }()

        // Obtain the exit channel before starting the process.
        exitCh, err := process.Wait(ctx)
        if err != nil {
                return nil, fmt.Errorf("failed to wait for process %q: %w", execID, err)
        }
        if err := process.Start(ctx); err != nil {
                return nil, fmt.Errorf("failed to start exec %q: %w", execID, err)
        }

        handleResizing(ctx, opts.resize, func(size remotecommand.TerminalSize) {
                if err := process.Resize(ctx, uint32(size.Width), uint32(size.Height)); err != nil {
                        log.G(ctx).WithError(err).Errorf("Failed to resize process %q console for container %q", execID, id)
                }
        })

        // attachDone is later handed to drainExecSyncIO to wait for the streams.
        attachDone := execIO.Attach(cio.AttachOptions{
                Stdin:     opts.stdin,
                Stdout:    opts.stdout,
                Stderr:    opts.stderr,
                Tty:       opts.tty,
                StdinOnce: true,
                CloseStdin: func() error {
                        return process.CloseIO(ctx, containerd.WithStdinCloser)
                },
        })

        // Without a timeout, execCtx is just ctx and only fires on cancellation.
        execCtx := ctx
        if opts.timeout > 0 {
                var execCtxCancel context.CancelFunc
                execCtx, execCtxCancel = context.WithTimeout(ctx, opts.timeout)
                defer execCtxCancel()
        }

        select {
        case <-execCtx.Done():
                // Ignore the not found error because the process may exit itself before killing.
                if err := process.Kill(ctx, syscall.SIGKILL); err != nil && !errdefs.IsNotFound(err) {
                        return nil, fmt.Errorf("failed to kill exec %q: %w", execID, err)
                }
                // Wait for the process to be killed.
                exitRes := <-exitCh
                log.G(ctx).Debugf("Timeout received while waiting for exec process kill %q code %d and error %v",
                        execID, exitRes.ExitCode(), exitRes.Error())

                // A drain failure is only logged here; the timeout error is what the
                // caller gets back.
                if err := drainExecSyncIO(ctx, process, drainExecSyncIOTimeout, attachDone); err != nil {
                        log.G(ctx).WithError(err).Warnf("failed to drain exec process %q io", execID)
                }

                return nil, fmt.Errorf("timeout %v exceeded: %w", opts.timeout, execCtx.Err())
        case exitRes := <-exitCh:
                code, _, err := exitRes.Result()
                log.G(ctx).Debugf("Exec process %q exits with exit code %d and error %v", execID, code, err)
                if err != nil {
                        return nil, fmt.Errorf("failed while waiting for exec %q: %w", execID, err)
                }

                // On a normal exit a drain failure is reported to the caller.
                if err := drainExecSyncIO(ctx, process, drainExecSyncIOTimeout, attachDone); err != nil {
                        return nil, fmt.Errorf("failed to drain exec process %q io: %w", execID, err)
                }
                return &code, nil
        }
}

// execInContainer looks up the container by id, verifies that it is running
// and then executes the command synchronously with stdio redirected according
// to opts.
//
// The call returns only after the exec process has exited, which implies:
//  1. while the exec process runs, a goroutine in the cri plugin keeps waiting
//     for its exit code;
//  2. `kubectl exec -it` hangs until the exec process exits, even after io is
//     detached. dockershim behaves differently and leaves the exec process
//     running in the background once io is detached, see
//     https://github.com/kubernetes/kubernetes/blob/v1.15.0/pkg/kubelet/dockershim/exec.go#L127
//
// For example, if the `kubectl exec -it` process is killed, IO is closed. The
// CRI plugin still has a goroutine waiting for the exec process to exit and
// log the exit code, whereas dockershim does not.
func (c *criService) execInContainer(ctx context.Context, id string, opts execOptions) (*uint32, error) {
        // Resolve the container through our container store.
        cntr, err := c.containerStore.Get(id)
        if err != nil {
                return nil, fmt.Errorf("failed to find container %q in store: %w", id, err)
        }

        if state := cntr.Status.Get().State(); state != runtime.ContainerState_CONTAINER_RUNNING {
                return nil, fmt.Errorf("container is in %s state", criContainerStateToString(state))
        }

        // From here on the ID recorded in the store is used instead of the
        // caller supplied one.
        return c.execInternal(ctx, cntr.Container, cntr.ID, opts)
}

// drainExecSyncIO waits, bounded by drainExecIOTimeout, for the exec process IO
// to finish after the exec init process has exited. A zero timeout means wait
// without a time limit.
//
// By default, child processes spawned by the exec process inherit its standard
// io file descriptors. The shim server creates a pipe as the data channel; the
// exec process and its children all write to the write end, and the shim server
// reads from the other end for as long as the write end stays open.
//
// With a command such as `bash -c "sleep 365d &"`, the exec process is bash,
// which exits right after spawning `sleep 365d`. The sleep process then keeps
// the write end of the pipe open for a year, and it makes no sense for the CRI
// plugin to wait for that. When the timeout fires, the exec process is deleted
// to release the io and an error is returned.
func drainExecSyncIO(ctx context.Context, execProcess containerd.Process, drainExecIOTimeout time.Duration, attachDone <-chan struct{}) error {
        // A nil channel never becomes ready, so without a timeout only attachDone
        // can end the wait.
        var expired <-chan time.Time
        if drainExecIOTimeout != 0 {
                t := time.NewTimer(drainExecIOTimeout)
                defer t.Stop()
                expired = t.C
        }

        select {
        case <-attachDone:
                log.G(ctx).Tracef("Stream pipe for exec process %q done", execProcess.ID())
                return nil
        case <-expired:
        }

        log.G(ctx).Debugf("Exec process %q exits but the io is still held by other processes. Trying to delete exec process to release io", execProcess.ID())
        if _, err := execProcess.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
                return fmt.Errorf("failed to release exec io by deleting exec process %q: %w",
                        execProcess.ID(), err)
        }
        return fmt.Errorf("failed to drain exec process %q io in %s because io is still held by other processes",
                execProcess.ID(), drainExecIOTimeout)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "time"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
)

// ListContainers returns every container in the store that matches the filter
// of the request, converted to its CRI representation. The elapsed time is
// recorded in containerListTimer.
func (c *criService) ListContainers(ctx context.Context, r *runtime.ListContainersRequest) (*runtime.ListContainersResponse, error) {
        begin := time.Now()

        // Convert everything in the store first, then apply the filter.
        var all []*runtime.Container
        for _, stored := range c.containerStore.List() {
                all = append(all, toCRIContainer(stored))
        }
        matched := c.filterCRIContainers(all, r.GetFilter())

        containerListTimer.UpdateSince(begin)
        return &runtime.ListContainersResponse{Containers: matched}, nil
}

// toCRIContainer builds the CRI container representation from an entry of the
// internal container store, using a single snapshot of the container status for
// the state and creation time.
func toCRIContainer(container containerstore.Container) *runtime.Container {
        snapshot := container.Status.Get()
        cfg := container.Config

        criContainer := &runtime.Container{
                Id:           container.ID,
                PodSandboxId: container.SandboxID,
                Metadata:     cfg.GetMetadata(),
                Image:        cfg.GetImage(),
                ImageRef:     container.ImageRef,
                State:        snapshot.State(),
                CreatedAt:    snapshot.CreatedAt,
                Labels:       cfg.GetLabels(),
                Annotations:  cfg.GetAnnotations(),
        }
        return criContainer
}

// normalizeContainerFilter rewrites the container id and pod sandbox id of the
// filter to the IDs recorded in the respective stores. A field whose lookup
// fails is left untouched.
func (c *criService) normalizeContainerFilter(filter *runtime.ContainerFilter) {
        cntr, err := c.containerStore.Get(filter.GetId())
        if err == nil {
                filter.Id = cntr.ID
        }

        sandbox, err := c.sandboxStore.Get(filter.GetPodSandboxId())
        if err == nil {
                filter.PodSandboxId = sandbox.ID
        }
}

// filterCRIContainers returns the containers that satisfy every criterion set
// in filter: container id, pod sandbox id, state and label selector. A nil
// filter returns the input slice unchanged; otherwise a new, non-nil slice is
// returned. The filter is normalized in place as a side effect.
func (c *criService) filterCRIContainers(containers []*runtime.Container, filter *runtime.ContainerFilter) []*runtime.Container {
        if filter == nil {
                return containers
        }

        // The containerd cri plugin accepts short ids as long as they are
        // unambiguous, so a pod id given in the filter is resolved against the
        // sandbox store first.
        wantSandbox := filter.GetPodSandboxId()
        if wantSandbox != "" {
                if sandbox, err := c.sandboxStore.Get(wantSandbox); err == nil {
                        wantSandbox = sandbox.ID
                }
        }

        c.normalizeContainerFilter(filter)

        wantID := filter.GetId()
        wantState := filter.GetState()
        selector := filter.GetLabelSelector()

        filtered := []*runtime.Container{}
nextContainer:
        for _, cntr := range containers {
                switch {
                case wantID != "" && wantID != cntr.Id:
                        continue
                case wantSandbox != "" && wantSandbox != cntr.PodSandboxId:
                        continue
                case wantState != nil && wantState.GetState() != cntr.State:
                        continue
                }
                // Every selector entry must be present with the same value.
                for key, want := range selector {
                        if got, ok := cntr.Labels[key]; !ok || got != want {
                                continue nextContainer
                        }
                }
                filtered = append(filtered, cntr)
        }

        return filtered
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "errors"
        "fmt"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// ReopenContainerLog makes the cri plugin reopen the stdout/stderr log file of
// a running container, which is typically requested after log rotation. New
// loggers are created for the container's log path and swapped in for the
// existing "log" output; the replaced writers are closed.
func (c *criService) ReopenContainerLog(ctx context.Context, r *runtime.ReopenContainerLogRequest) (*runtime.ReopenContainerLogResponse, error) {
        container, err := c.containerStore.Get(r.GetContainerId())
        if err != nil {
                return nil, fmt.Errorf("an error occurred when try to find container %q: %w", r.GetContainerId(), err)
        }

        if state := container.Status.Get().State(); state != runtime.ContainerState_CONTAINER_RUNNING {
                return nil, errors.New("container is not running")
        }

        // Build fresh loggers and install them in place of the current ones.
        newStdout, newStderr, err := c.createContainerLoggers(container.LogPath, container.Config.GetTty())
        if err != nil {
                return nil, err
        }
        prevStdout, prevStderr := container.IO.AddOutput("log", newStdout, newStderr)
        if prevStdout != nil {
                prevStdout.Close()
        }
        if prevStderr != nil {
                prevStderr.Close()
        }
        return &runtime.ReopenContainerLogResponse{}, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "errors"
        "fmt"
        "time"

        containerd "github.com/containerd/containerd/v2/client"
        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// RemoveContainer removes the container identified by the request.
//
// Removing a container that is not in the container store is not an error. A
// container that the store knows but containerd does not is dropped from the
// store and the name index. Otherwise the container is forcibly stopped if it
// is running or in unknown state, marked as removing, reported to NRI, deleted
// from containerd together with its snapshot, and its checkpoint and root
// directories are removed before the store entry and name are released.
//
// If removal fails after the removing state was set, that state is reset so
// that the removal can be retried.
func (c *criService) RemoveContainer(ctx context.Context, r *runtime.RemoveContainerRequest) (_ *runtime.RemoveContainerResponse, retErr error) {
        start := time.Now()
        ctrID := r.GetContainerId()
        container, err := c.containerStore.Get(ctrID)
        if err != nil {
                if !errdefs.IsNotFound(err) {
                        return nil, fmt.Errorf("an error occurred when try to find container %q: %w", ctrID, err)
                }
                // Do not return error if container metadata doesn't exist.
                log.G(ctx).Tracef("RemoveContainer called for container %q that does not exist", ctrID)
                return &runtime.RemoveContainerResponse{}, nil
        }
        // ctrID is whatever the caller sent; id is the ID recorded in the store
        // and is the one used for all cleanup below.
        id := container.ID
        i, err := container.Container.Info(ctx)
        if err != nil {
                if !errdefs.IsNotFound(err) {
                        return nil, fmt.Errorf("get container info: %w", err)
                }
                // Since containerd doesn't see the container and criservice's container store does,
                // we should try to recover from this state by removing entry for this container
                // from the container store as well and return successfully.
                log.G(ctx).WithError(err).Warn("get container info failed")
                // Use the stored ID, as the regular removal path does, so that the
                // name index entry is released even if the request carried a
                // different form of the ID.
                c.containerStore.Delete(id)
                c.containerNameIndex.ReleaseByKey(id)
                return &runtime.RemoveContainerResponse{}, nil
        }

        // Forcibly stop the containers if they are in running or unknown state
        state := container.Status.Get().State()
        if state == runtime.ContainerState_CONTAINER_RUNNING ||
                state == runtime.ContainerState_CONTAINER_UNKNOWN {
                log.L.Infof("Forcibly stopping container %q", id)
                if err := c.stopContainer(ctx, container, 0); err != nil {
                        return nil, fmt.Errorf("failed to forcibly stop container %q: %w", id, err)
                }

        }

        // Set removing state to prevent other start/remove operations against this container
        // while it's being removed.
        if err := setContainerRemoving(container); err != nil {
                return nil, fmt.Errorf("failed to set removing state for container %q: %w", id, err)
        }
        defer func() {
                if retErr != nil {
                        // Reset removing if remove failed.
                        if err := resetContainerRemoving(container); err != nil {
                                log.G(ctx).WithError(err).Errorf("failed to reset removing state for container %q", id)
                        }
                }
        }()

        // NRI is notified with a nil sandbox when the sandbox can no longer be
        // found. An NRI failure is logged and does not abort the removal.
        sandbox, err := c.sandboxStore.Get(container.SandboxID)
        if err != nil {
                err = c.nri.RemoveContainer(ctx, nil, &container)
        } else {
                err = c.nri.RemoveContainer(ctx, &sandbox, &container)
        }
        if err != nil {
                log.G(ctx).WithError(err).Error("NRI failed to remove container")
        }

        // NOTE(random-liu): Docker set container to "Dead" state when start removing the
        // container so as to avoid start/restart the container again. However, for current
        // kubelet implementation, we'll never start a container once we decide to remove it,
        // so we don't need the "Dead" state for now.

        // Delete containerd container.
        if err := container.Container.Delete(ctx, containerd.WithSnapshotCleanup); err != nil {
                if !errdefs.IsNotFound(err) {
                        return nil, fmt.Errorf("failed to delete containerd container %q: %w", id, err)
                }
                log.G(ctx).Tracef("Remove called for containerd container %q that does not exist", id)
        }

        // Delete container checkpoint.
        if err := container.Delete(); err != nil {
                return nil, fmt.Errorf("failed to delete container checkpoint for %q: %w", id, err)
        }

        containerRootDir := c.getContainerRootDir(id)
        if err := ensureRemoveAll(ctx, containerRootDir); err != nil {
                return nil, fmt.Errorf("failed to remove container root directory %q: %w",
                        containerRootDir, err)
        }
        volatileContainerRootDir := c.getVolatileContainerRootDir(id)
        if err := ensureRemoveAll(ctx, volatileContainerRootDir); err != nil {
                return nil, fmt.Errorf("failed to remove volatile container root directory %q: %w",
                        volatileContainerRootDir, err)
        }

        c.containerStore.Delete(id)

        c.containerNameIndex.ReleaseByKey(id)

        c.generateAndSendContainerEvent(ctx, id, container.SandboxID, runtime.ContainerEventType_CONTAINER_DELETED_EVENT)

        containerRemoveTimer.WithValues(i.Runtime.Name).UpdateSince(start)

        return &runtime.RemoveContainerResponse{}, nil
}

// setContainerRemoving flags the container as being removed. While the flag is
// set the container can be neither started nor removed a second time. The
// update is rejected when the container is running, in an unknown state,
// currently starting, or already flagged as removing.
func setContainerRemoving(container containerstore.Container) error {
        markRemoving := func(status containerstore.Status) (containerstore.Status, error) {
                state := status.State()
                switch {
                case state == runtime.ContainerState_CONTAINER_RUNNING:
                        // A running container has to be stopped before it can be removed.
                        return status, errors.New("container is still running, to stop first")
                case state == runtime.ContainerState_CONTAINER_UNKNOWN:
                        return status, errors.New("container state is unknown, to stop first")
                case status.Starting:
                        return status, errors.New("container is in starting state, can't be removed")
                case status.Removing:
                        return status, errors.New("container is already in removing state")
                }
                status.Removing = true
                return status, nil
        }
        return container.Status.Update(markRemoving)
}

// resetContainerRemoving clears the removing flag of the container. It is meant
// to be used after a failed removal so that the removal can be attempted again.
func resetContainerRemoving(container containerstore.Container) error {
        clearRemoving := func(status containerstore.Status) (containerstore.Status, error) {
                status.Removing = false
                return status, nil
        }
        return container.Status.Update(clearRemoving)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "errors"
        "fmt"
        "io"
        "time"

        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        cio "github.com/containerd/containerd/v2/internal/cri/io"
        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        containerdio "github.com/containerd/containerd/v2/pkg/cio"
        cioutil "github.com/containerd/containerd/v2/pkg/ioutil"
)

// StartContainer starts the container.
//
// The container is looked up in the container store by the request's container
// ID and flagged as starting for the duration of the call. The owning sandbox
// must be in the ready state. A containerd task is then created, NRI is
// notified, the task is started, and the resulting pid and start time are
// written to the container status. On success an exit monitor is started for
// the task, a CONTAINER_STARTED_EVENT is sent, and the start duration is
// recorded in containerStartTimer.
//
// Cleanup on failure is done by deferred functions keyed on retErr; because
// defers run in reverse registration order, the NRI stop notification runs
// first, then the task deletion, then the status rollback.
func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContainerRequest) (retRes *runtime.StartContainerResponse, retErr error) {
        start := time.Now()
        cntr, err := c.containerStore.Get(r.GetContainerId())
        if err != nil {
                return nil, fmt.Errorf("an error occurred when try to find container %q: %w", r.GetContainerId(), err)
        }

        // info is only used at the end of the function to label the start timer
        // with the runtime name.
        info, err := cntr.Container.Info(ctx)
        if err != nil {
                return nil, fmt.Errorf("get container info: %w", err)
        }

        id := cntr.ID
        meta := cntr.Metadata
        container := cntr.Container
        config := meta.Config

        // Set starting state to prevent other start/remove operations against this container
        // while it's being started.
        if err := setContainerStarting(cntr); err != nil {
                return nil, fmt.Errorf("failed to set starting state for container %q: %w", id, err)
        }
        // Registered only after the starting flag was set successfully: on any
        // later failure the start error is recorded in the status, and in all
        // cases the starting flag is cleared again.
        defer func() {
                if retErr != nil {
                        // Set container to exited if fail to start.
                        if err := cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
                                status.Pid = 0
                                status.FinishedAt = time.Now().UnixNano()
                                status.ExitCode = errorStartExitCode
                                status.Reason = errorStartReason
                                status.Message = retErr.Error()
                                return status, nil
                        }); err != nil {
                                log.G(ctx).WithError(err).Errorf("failed to set start failure state for container %q", id)
                        }
                }
                if err := resetContainerStarting(cntr); err != nil {
                        log.G(ctx).WithError(err).Errorf("failed to reset starting state for container %q", id)
                }
        }()

        // Get sandbox config from sandbox store.
        sandbox, err := c.sandboxStore.Get(meta.SandboxID)
        if err != nil {
                return nil, fmt.Errorf("sandbox %q not found: %w", meta.SandboxID, err)
        }
        sandboxID := meta.SandboxID
        if sandbox.Status.Get().State != sandboxstore.StateReady {
                return nil, fmt.Errorf("sandbox container %q is not running", sandboxID)
        }

        // Recheck target container validity in Linux namespace options.
        if linux := config.GetLinux(); linux != nil {
                nsOpts := linux.GetSecurityContext().GetNamespaceOptions()
                if nsOpts.GetPid() == runtime.NamespaceMode_TARGET {
                        _, err := c.validateTargetContainer(sandboxID, nsOpts.TargetId)
                        if err != nil {
                                return nil, fmt.Errorf("invalid target container: %w", err)
                        }
                }
        }

        // ioCreation is handed to NewTask below. It creates the container loggers,
        // attaches them to the container IO as the "log" output and returns that
        // IO. The id argument is not used.
        ioCreation := func(id string) (_ containerdio.IO, err error) {
                stdoutWC, stderrWC, err := c.createContainerLoggers(meta.LogPath, config.GetTty())
                if err != nil {
                        return nil, fmt.Errorf("failed to create container loggers: %w", err)
                }
                cntr.IO.AddOutput("log", stdoutWC, stderrWC)
                cntr.IO.Pipe()
                return cntr.IO, nil
        }

        ociRuntime, err := c.config.GetSandboxRuntime(sandbox.Config, sandbox.Metadata.RuntimeHandler)
        if err != nil {
                return nil, fmt.Errorf("failed to get sandbox runtime: %w", err)
        }

        // Only pass a runtime path option when the runtime configuration sets one.
        var taskOpts []containerd.NewTaskOpts
        if ociRuntime.Path != "" {
                taskOpts = append(taskOpts, containerd.WithRuntimePath(ociRuntime.Path))
        }

        // append endpoint to the options so that task manager can get task api endpoint directly
        endpoint := sandbox.Endpoint
        if endpoint.IsValid() {
                taskOpts = append(taskOpts,
                        containerd.WithTaskAPIEndpoint(endpoint.Address, endpoint.Version))
        }

        task, err := container.NewTask(ctx, ioCreation, taskOpts...)
        if err != nil {
                return nil, fmt.Errorf("failed to create containerd task: %w", err)
        }
        // From here on a failure must delete the task that was just created. A
        // fresh context from ctrdutil.DeferContext is used for the cleanup call
        // instead of the request context.
        defer func() {
                if retErr != nil {
                        deferCtx, deferCancel := ctrdutil.DeferContext()
                        defer deferCancel()
                        // It's possible that task is deleted by event monitor.
                        if _, err := task.Delete(deferCtx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
                                log.G(ctx).WithError(err).Errorf("Failed to delete containerd task %q", id)
                        }
                }
        }()

        // wait is a long running background request, no timeout needed.
        exitCh, err := task.Wait(ctrdutil.NamespacedContext())
        if err != nil {
                return nil, fmt.Errorf("failed to wait for containerd task: %w", err)
        }

        // Registered before the NRI start call below, so NRI also receives a stop
        // notification when that call itself, or any later step, fails. A failure
        // of the stop notification is only logged.
        defer func() {
                if retErr != nil {
                        deferCtx, deferCancel := ctrdutil.DeferContext()
                        defer deferCancel()
                        err = c.nri.StopContainer(deferCtx, &sandbox, &cntr)
                        if err != nil {
                                log.G(ctx).WithError(err).Errorf("NRI stop failed for failed container %q", id)
                        }
                }
        }()

        err = c.nri.StartContainer(ctx, &sandbox, &cntr)
        if err != nil {
                log.G(ctx).WithError(err).Errorf("NRI container start failed")
                return nil, fmt.Errorf("NRI container start failed: %w", err)
        }

        // Start containerd task.
        if err := task.Start(ctx); err != nil {
                return nil, fmt.Errorf("failed to start containerd task %q: %w", id, err)
        }

        // Update container start timestamp.
        if err := cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
                status.Pid = task.Pid()
                status.StartedAt = time.Now().UnixNano()
                return status, nil
        }); err != nil {
                return nil, fmt.Errorf("failed to update container %q state: %w", id, err)
        }

        // It handles the TaskExit event and update container state after this.
        // The monitor is given context.Background() rather than the request context.
        c.startContainerExitMonitor(context.Background(), id, task.Pid(), exitCh)

        c.generateAndSendContainerEvent(ctx, id, sandboxID, runtime.ContainerEventType_CONTAINER_STARTED_EVENT)

        // A failed post-start notification is logged but does not fail the start.
        err = c.nri.PostStartContainer(ctx, &sandbox, &cntr)
        if err != nil {
                log.G(ctx).WithError(err).Errorf("NRI post-start notification failed")
        }

        containerStartTimer.WithValues(info.Runtime.Name).UpdateSince(start)

        return &runtime.StartContainerResponse{}, nil
}

// setContainerStarting flags the container as being started. While the flag is
// set the container can be neither removed nor started a second time. The
// update is rejected unless the container is in the created state, and also
// when a removal or another start is already in progress.
func setContainerStarting(container containerstore.Container) error {
        markStarting := func(status containerstore.Status) (containerstore.Status, error) {
                state := status.State()
                switch {
                case state != runtime.ContainerState_CONTAINER_CREATED:
                        // Only a container in created state may be started.
                        return status, fmt.Errorf("container is in %s state", criContainerStateToString(state))
                case status.Removing:
                        // A removal in progress wins over the start request.
                        return status, errors.New("container is in removing state, can't be started")
                case status.Starting:
                        return status, errors.New("container is already in starting state")
                }
                status.Starting = true
                return status, nil
        }
        return container.Status.Update(markStarting)
}

// resetContainerStarting clears the starting flag of the container, so that the
// container can be removed (or handled otherwise) once the start attempt is over.
func resetContainerStarting(container containerstore.Container) error {
        clearStarting := func(status containerstore.Status) (containerstore.Status, error) {
                status.Starting = false
                return status, nil
        }
        return container.Status.Update(clearStarting)
}

// createContainerLoggers creates the container loggers and returns the write
// closers for stdout and stderr.
//
// When logPath is empty both streams get a discard logger. Otherwise the log
// file at logPath is opened, wrapped with cioutil.NewSerialWriteCloser and
// handed to the CRI loggers. A stderr logger is only created when tty is false,
// so with a tty the returned stderr is nil. An error is returned only when the
// log file cannot be opened.
//
// The log file is closed by a background goroutine once every logger that was
// created has signalled completion on its channel. There is no error path after
// the file has been opened, so no additional cleanup of the file is needed here.
func (c *criService) createContainerLoggers(logPath string, tty bool) (stdout io.WriteCloser, stderr io.WriteCloser, err error) {
        if logPath == "" {
                // Only generate container log when log path is specified.
                stdout = cio.NewDiscardLogger()
                stderr = cio.NewDiscardLogger()
                return stdout, stderr, nil
        }

        f, err := openLogFile(logPath)
        if err != nil {
                return nil, nil, fmt.Errorf("failed to create and open log file: %w", err)
        }

        var stdoutCh, stderrCh <-chan struct{}
        wc := cioutil.NewSerialWriteCloser(f)
        stdout, stdoutCh = cio.NewCRILogger(logPath, wc, cio.Stdout, c.config.MaxContainerLogLineSize)
        // Only redirect stderr when there is no tty.
        if !tty {
                stderr, stderrCh = cio.NewCRILogger(logPath, wc, cio.Stderr, c.config.MaxContainerLogLineSize)
        }

        // Close the file once all loggers writing to it are done.
        go func() {
                if stdoutCh != nil {
                        <-stdoutCh
                }
                if stderrCh != nil {
                        <-stderrCh
                }
                log.L.Debugf("Finish redirecting log file %q, closing it", logPath)
                f.Close()
        }()
        return stdout, stderr, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"

        "github.com/containerd/containerd/api/services/tasks/v1"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// ContainerStats returns the stats of a single container. An error is returned
// when the container is not in the container store, when the task metrics
// cannot be fetched, when the metrics response does not hold exactly one entry,
// or when the metrics cannot be converted.
func (c *criService) ContainerStats(ctx context.Context, in *runtime.ContainerStatsRequest) (*runtime.ContainerStatsResponse, error) {
        cntr, err := c.containerStore.Get(in.GetContainerId())
        if err != nil {
                return nil, fmt.Errorf("failed to find container: %w", err)
        }

        // Ask the task service for the metrics of exactly this container.
        metricsResp, err := c.client.TaskService().Metrics(ctx, &tasks.MetricsRequest{
                Filters: []string{"id==" + cntr.ID},
        })
        if err != nil {
                return nil, fmt.Errorf("failed to fetch metrics for task: %w", err)
        }
        if count := len(metricsResp.Metrics); count != 1 {
                return nil, fmt.Errorf("unexpected metrics response: %+v", metricsResp.Metrics)
        }

        // The conversion depends on the platform of the container's sandbox.
        convert, err := c.getMetricsHandler(ctx, cntr.SandboxID)
        if err != nil {
                return nil, err
        }

        containerStats, err := convert(cntr.Metadata, metricsResp.Metrics[0])
        if err != nil {
                return nil, fmt.Errorf("failed to decode container metrics: %w", err)
        }
        return &runtime.ContainerStatsResponse{Stats: containerStats}, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "errors"
        "fmt"
        "reflect"
        "time"

        wstats "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats"
        cg1 "github.com/containerd/cgroups/v3/cgroup1/stats"
        cg2 "github.com/containerd/cgroups/v3/cgroup2/stats"
        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        "github.com/containerd/containerd/api/services/tasks/v1"
        "github.com/containerd/containerd/api/types"
        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        "github.com/containerd/containerd/v2/internal/cri/store/stats"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
)

// ListContainerStats returns the stats of the containers in the container store
// that match the filter of the request; without a filter every container in the
// store is considered.
func (c *criService) ListContainerStats(
        ctx context.Context,
        in *runtime.ListContainerStatsRequest,
) (*runtime.ListContainerStatsResponse, error) {
        // Translate the CRI request into a task metrics request plus the list of
        // containers the stats are wanted for.
        metricsReq, selected, err := c.buildTaskMetricsRequest(in)
        if err != nil {
                return nil, fmt.Errorf("failed to build metrics request: %w", err)
        }

        metricsResp, err := c.client.TaskService().Metrics(ctx, metricsReq)
        if err != nil {
                return nil, fmt.Errorf("failed to fetch metrics for tasks: %w", err)
        }

        listResp, err := c.toCRIContainerStats(ctx, metricsResp.Metrics, selected)
        if err != nil {
                return nil, fmt.Errorf("failed to convert to cri containerd stats format: %w", err)
        }
        return listResp, nil
}

// metricsHandler converts the raw task metric of a container, together with the
// container's stored metadata, into CRI container stats.
type metricsHandler func(containerstore.Metadata, *types.Metric) (*runtime.ContainerStats, error)

// getMetricsHandler returns the function that transforms container metrics into
// the CRI format for containers of the given sandbox. The choice is based on the
// platform the sandbox advertises; for a platform without metrics support a
// wrapped [errdefs.ErrNotImplemented] is returned.
func (c *criService) getMetricsHandler(ctx context.Context, sandboxID string) (metricsHandler, error) {
        sandbox, err := c.sandboxStore.Get(sandboxID)
        if err != nil {
                return nil, fmt.Errorf("failed to find sandbox id %q: %w", sandboxID, err)
        }

        // Use the platform advertised by the sandbox rather than the host platform:
        // a sandbox that virtualizes or emulates another platform reports its stats
        // in that platform's format, and the conversion has to follow it.
        platform, err := c.sandboxService.SandboxPlatform(ctx, sandbox.Sandboxer, sandboxID)
        if err != nil {
                return nil, err
        }

        ociRuntime, err := c.config.GetSandboxRuntime(sandbox.Config, sandbox.RuntimeHandler)
        if err != nil {
                return nil, fmt.Errorf("failed to get runtimeHandler %q: %w", sandbox.RuntimeHandler, err)
        }
        snapshotter := c.RuntimeSnapshotter(ctx, ociRuntime)

        // Pick the platform specific converter, then bind the snapshotter to it.
        var convert func(containerstore.Metadata, *types.Metric, string) (*runtime.ContainerStats, error)
        switch platform.OS {
        case "windows":
                convert = c.windowsContainerMetrics
        case "linux":
                convert = c.linuxContainerMetrics
        default:
                return nil, fmt.Errorf("container metrics for platform %+v: %w", platform, errdefs.ErrNotImplemented)
        }
        return func(meta containerstore.Metadata, stats *types.Metric) (*runtime.ContainerStats, error) {
                return convert(meta, stats, snapshotter)
        }, nil
}

// toCRIContainerStats converts the task metrics in stats into CRI container
// stats for every container in containers. A container without a matching
// metric is still converted, with a nil metric passed to the handler.
//
// A container whose sandbox is no longer in the sandbox store is skipped when
// the container itself has also disappeared from the container store; any
// other failure to obtain a metrics handler, to decode the metrics, or to
// compute the CPU usage rate aborts the conversion with an error.
func (c *criService) toCRIContainerStats(
        ctx context.Context,
        stats []*types.Metric,
        containers []containerstore.Container,
) (*runtime.ListContainerStatsResponse, error) {
        statsMap := make(map[string]*types.Metric, len(stats))
        for _, stat := range stats {
                statsMap[stat.ID] = stat
        }
        containerStats := new(runtime.ListContainerStatsResponse)

        // Unfortunately if no filter was passed we're asking for every container's stats, which
        // generally belong to multiple different pods that might all have different platforms.
        // To avoid recalculating the right metricsHandler to invoke, if we've already calculated
        // the platform and handler for a given sandbox just pull it from our map here.
        sandboxToMetricsHandler := make(map[string]metricsHandler)
        for _, cntr := range containers {
                handler, ok := sandboxToMetricsHandler[cntr.SandboxID]
                if !ok {
                        var err error
                        handler, err = c.getMetricsHandler(ctx, cntr.SandboxID)
                        if err != nil {
                                // If the sandbox is not found, it may have been removed. Check whether
                                // the container still exists; if it is gone as well, skip it. The lookup
                                // error is kept in its own variable so that the handler error reported
                                // below is never overwritten (and never becomes nil).
                                if errdefs.IsNotFound(err) {
                                        if _, getErr := c.containerStore.Get(cntr.ID); errdefs.IsNotFound(getErr) {
                                                log.G(ctx).Warnf("container %q is not found, skip it", cntr.ID)
                                                continue
                                        }
                                }
                                return nil, fmt.Errorf("failed to get metrics handler for container %q: %w", cntr.ID, err)
                        }
                        sandboxToMetricsHandler[cntr.SandboxID] = handler
                }

                cs, err := handler(cntr.Metadata, statsMap[cntr.ID])
                if err != nil {
                        return nil, fmt.Errorf("failed to decode container metrics for %q: %w", cntr.ID, err)
                }

                if cs.Cpu != nil && cs.Cpu.UsageCoreNanoSeconds != nil {
                        // this is a calculated value and should be computed for all OSes
                        nanoUsage, err := c.getUsageNanoCores(cntr.Metadata.ID, false, cs.Cpu.UsageCoreNanoSeconds.Value, time.Unix(0, cs.Cpu.Timestamp))
                        if err != nil {
                                return nil, fmt.Errorf("failed to get usage nano cores, containerID: %s: %w", cntr.Metadata.ID, err)
                        }
                        cs.Cpu.UsageNanoCores = &runtime.UInt64Value{Value: nanoUsage}
                }
                containerStats.Stats = append(containerStats.Stats, cs)
        }
        return containerStats, nil
}

// getUsageNanoCores computes the CPU usage rate in nano cores from the
// cumulative CPU usage sample (currentUsageCoreNanoSeconds taken at
// currentTimestamp) and the previous sample stored for containerID. With
// isSandbox set, containerID is looked up in the sandbox store, otherwise in
// the container store.
//
// It returns 0 when there is no previous sample (the current one is then stored
// as the baseline), when the interval to the previous sample is zero or
// negative, or when the cumulative usage went backwards. In the latter two
// cases the stored sample is left untouched. Otherwise the rate is returned and
// the current sample replaces the stored one.
func (c *criService) getUsageNanoCores(containerID string, isSandbox bool, currentUsageCoreNanoSeconds uint64, currentTimestamp time.Time) (uint64, error) {
        var oldStats *stats.ContainerStats

        if isSandbox {
                sandbox, err := c.sandboxStore.Get(containerID)
                if err != nil {
                        return 0, fmt.Errorf("failed to get sandbox container: %s: %w", containerID, err)
                }
                oldStats = sandbox.Stats
        } else {
                container, err := c.containerStore.Get(containerID)
                if err != nil {
                        return 0, fmt.Errorf("failed to get container ID: %s: %w", containerID, err)
                }
                oldStats = container.Stats
        }

        if oldStats == nil {
                // First sample: nothing to compare against yet, just remember it.
                newStats := &stats.ContainerStats{
                        UsageCoreNanoSeconds: currentUsageCoreNanoSeconds,
                        Timestamp:            currentTimestamp,
                }
                if isSandbox {
                        err := c.sandboxStore.UpdateContainerStats(containerID, newStats)
                        if err != nil {
                                return 0, fmt.Errorf("failed to update sandbox stats container ID: %s: %w", containerID, err)
                        }
                } else {
                        err := c.containerStore.UpdateContainerStats(containerID, newStats)
                        if err != nil {
                                return 0, fmt.Errorf("failed to update container stats ID: %s: %w", containerID, err)
                        }
                }
                return 0, nil
        }

        nanoSeconds := currentTimestamp.UnixNano() - oldStats.Timestamp.UnixNano()

        // zero or negative interval
        if nanoSeconds <= 0 {
                return 0, nil
        }

        // The cumulative usage must not go backwards. Both values are unsigned, so
        // subtracting a larger previous value would wrap around and be reported as
        // an enormous usage rate.
        if currentUsageCoreNanoSeconds < oldStats.UsageCoreNanoSeconds {
                return 0, nil
        }

        newUsageNanoCores := uint64(float64(currentUsageCoreNanoSeconds-oldStats.UsageCoreNanoSeconds) /
                float64(nanoSeconds) * float64(time.Second/time.Nanosecond))

        newStats := &stats.ContainerStats{
                UsageCoreNanoSeconds: currentUsageCoreNanoSeconds,
                Timestamp:            currentTimestamp,
        }
        if isSandbox {
                err := c.sandboxStore.UpdateContainerStats(containerID, newStats)
                if err != nil {
                        return 0, fmt.Errorf("failed to update sandbox container stats: %s: %w", containerID, err)
                }
        } else {
                err := c.containerStore.UpdateContainerStats(containerID, newStats)
                if err != nil {
                        return 0, fmt.Errorf("failed to update container stats ID: %s: %w", containerID, err)
                }
        }

        return newUsageNanoCores, nil
}

// normalizeContainerStatsFilter rewrites the container ID and the pod sandbox ID
// of the filter, in place, to the IDs of the matching store entries. A field
// whose store lookup fails is left as it is.
func (c *criService) normalizeContainerStatsFilter(filter *runtime.ContainerStatsFilter) {
        cntr, cntrErr := c.containerStore.Get(filter.GetId())
        if cntrErr == nil {
                filter.Id = cntr.ID
        }

        sb, sbErr := c.sandboxStore.Get(filter.GetPodSandboxId())
        if sbErr == nil {
                filter.PodSandboxId = sb.ID
        }
}

// buildTaskMetricsRequest constructs a tasks.MetricsRequest from the stats
// request and the container store, and returns it together with the containers
// the stats are wanted for.
//
// Without a filter the request carries no task filters and all containers of
// the store are returned. With a filter, the filter is normalized in place
// first; then every container matching the ID, the pod sandbox ID and the label
// selector (each only checked when set) is returned and gets an "id==" task
// filter in the request.
func (c *criService) buildTaskMetricsRequest(
        r *runtime.ListContainerStatsRequest,
) (*tasks.MetricsRequest, []containerstore.Container, error) {
        req := &tasks.MetricsRequest{}
        filter := r.GetFilter()
        if filter == nil {
                return req, c.containerStore.List(), nil
        }

        c.normalizeContainerStatsFilter(filter)
        wantID := filter.GetId()
        wantSandboxID := filter.GetPodSandboxId()
        selector := filter.GetLabelSelector()

        var matched []containerstore.Container
        for _, cntr := range c.containerStore.List() {
                switch {
                case wantID != "" && cntr.ID != wantID:
                        continue
                case wantSandboxID != "" && cntr.SandboxID != wantSandboxID:
                        continue
                case selector != nil && !matchLabelSelector(selector, cntr.Config.GetLabels()):
                        continue
                }
                matched = append(matched, cntr)
                req.Filters = append(req.Filters, "id=="+cntr.ID)
        }
        return req, matched, nil
}

// matchLabelSelector reports whether labels satisfies selector, i.e. whether
// every key of selector is present in labels with exactly the same value. An
// empty or nil selector matches any labels.
func matchLabelSelector(selector, labels map[string]string) bool {
        for key, want := range selector {
                got, present := labels[key]
                if !present || got != want {
                        return false
                }
        }
        return true
}

// windowsContainerMetrics builds the CRI stats of a container whose sandbox
// reports metrics in the Windows format.
//
// The writable layer usage is taken from the snapshot information returned by
// c.GetSnapshot for the container ID and snapshotter; when that lookup fails the
// usage is reported as zero. When stats is nil only the attributes and the
// writable layer are filled in. Otherwise stats.Data is unmarshalled and the CPU
// and memory sections are filled from the Windows statistics, each only when the
// corresponding section is present.
//
// An error is returned when the metric payload cannot be unmarshalled, is not a
// Windows statistics message, or carries no Windows statistics.
func (c *criService) windowsContainerMetrics(
        meta containerstore.Metadata,
        stats *types.Metric,
        snapshotter string,
) (*runtime.ContainerStats, error) {
        var cs runtime.ContainerStats
        var usedBytes, inodesUsed uint64
        sn, err := c.GetSnapshot(meta.ID, snapshotter)
        // If snapshotstore doesn't have cached snapshot information
        // set WritableLayer usage to zero
        if err == nil {
                usedBytes = sn.Size
                inodesUsed = sn.Inodes
        }
        cs.WritableLayer = &runtime.FilesystemUsage{
                Timestamp: sn.Timestamp,
                FsId: &runtime.FilesystemIdentifier{
                        Mountpoint: c.imageFSPaths[snapshotter],
                },
                UsedBytes:  &runtime.UInt64Value{Value: usedBytes},
                InodesUsed: &runtime.UInt64Value{Value: inodesUsed},
        }
        cs.Attributes = &runtime.ContainerAttributes{
                Id:          meta.ID,
                Metadata:    meta.Config.GetMetadata(),
                Labels:      meta.Config.GetLabels(),
                Annotations: meta.Config.GetAnnotations(),
        }

        if stats == nil {
                return &cs, nil
        }

        s, err := typeurl.UnmarshalAny(stats.Data)
        if err != nil {
                return nil, fmt.Errorf("failed to extract container metrics: %w", err)
        }
        // Use a checked assertion: a payload of any other registered type must be
        // reported as an error instead of panicking.
        statistics, ok := s.(*wstats.Statistics)
        if !ok {
                return nil, fmt.Errorf("unexpected container metrics type %T, expected windows statistics", s)
        }
        winStats := statistics.GetWindows()
        if winStats == nil {
                return nil, errors.New("windows stats is empty")
        }

        timestamp := protobuf.FromTimestamp(winStats.Timestamp).UnixNano()
        if winStats.Processor != nil {
                cs.Cpu = &runtime.CpuUsage{
                        Timestamp:            timestamp,
                        UsageCoreNanoSeconds: &runtime.UInt64Value{Value: winStats.Processor.TotalRuntimeNS},
                }
        }
        if winStats.Memory != nil {
                cs.Memory = &runtime.MemoryUsage{
                        Timestamp: timestamp,
                        WorkingSetBytes: &runtime.UInt64Value{
                                Value: winStats.Memory.MemoryUsagePrivateWorkingSetBytes,
                        },
                }
        }
        return &cs, nil
}

// linuxContainerMetrics builds the CRI stats of a container whose sandbox
// advertises the linux platform.
//
// The writable layer usage comes from the snapshot information returned by
// c.GetSnapshot; it is reported as zero when that lookup fails. When stats is
// nil only the attributes and the writable layer are filled in. Otherwise
// stats.Data must be a cgroup v1 metrics, cgroup v2 metrics or Windows
// statistics message; it is unmarshalled and passed to cpuContainerStats and
// memoryContainerStats to fill the CPU and memory sections.
func (c *criService) linuxContainerMetrics(
        meta containerstore.Metadata,
        stats *types.Metric,
        snapshotter string,
) (*runtime.ContainerStats, error) {
        // If snapshotstore doesn't have cached snapshot information
        // the WritableLayer usage stays at zero.
        var usedBytes, inodesUsed uint64
        sn, snErr := c.GetSnapshot(meta.ID, snapshotter)
        if snErr == nil {
                usedBytes, inodesUsed = sn.Size, sn.Inodes
        }

        cs := &runtime.ContainerStats{
                WritableLayer: &runtime.FilesystemUsage{
                        Timestamp: sn.Timestamp,
                        FsId: &runtime.FilesystemIdentifier{
                                Mountpoint: c.imageFSPaths[snapshotter],
                        },
                        UsedBytes:  &runtime.UInt64Value{Value: usedBytes},
                        InodesUsed: &runtime.UInt64Value{Value: inodesUsed},
                },
                Attributes: &runtime.ContainerAttributes{
                        Id:          meta.ID,
                        Metadata:    meta.Config.GetMetadata(),
                        Labels:      meta.Config.GetLabels(),
                        Annotations: meta.Config.GetAnnotations(),
                },
        }
        if stats == nil {
                return cs, nil
        }

        // Allocate the message matching the type URL of the payload.
        var data interface{}
        switch {
        case typeurl.Is(stats.Data, (*cg1.Metrics)(nil)):
                data = &cg1.Metrics{}
        case typeurl.Is(stats.Data, (*cg2.Metrics)(nil)):
                data = &cg2.Metrics{}
        case typeurl.Is(stats.Data, (*wstats.Statistics)(nil)):
                data = &wstats.Statistics{}
        default:
                return nil, errors.New("cannot convert metric data to cgroups.Metrics or windows.Statistics")
        }
        if err := typeurl.UnmarshalTo(stats.Data, data); err != nil {
                return nil, fmt.Errorf("failed to extract container metrics: %w", err)
        }

        sampledAt := protobuf.FromTimestamp(stats.Timestamp)

        cpuStats, err := c.cpuContainerStats(meta.ID, false /* isSandbox */, data, sampledAt)
        if err != nil {
                return nil, fmt.Errorf("failed to obtain cpu stats: %w", err)
        }
        cs.Cpu = cpuStats

        memoryStats, err := c.memoryContainerStats(meta.ID, data, sampledAt)
        if err != nil {
                return nil, fmt.Errorf("failed to obtain memory stats: %w", err)
        }
        cs.Memory = memoryStats

        return cs, nil
}

// getWorkingSet derives the working set size from cgroup v1 memory stats:
//
//	workingset = usage - total_inactive_file
//
// The result is clamped at zero. The caller must pass a non-nil memory; when
// memory.Usage is nil the working set is reported as 0.
func getWorkingSet(memory *cg1.MemoryStat) uint64 {
        usage := memory.Usage
        if usage == nil {
                return 0
        }
        inactive := memory.TotalInactiveFile
        if usage.Usage <= inactive {
                // Nothing left once the inactive file cache is discounted.
                return 0
        }
        return usage.Usage - inactive
}

// getWorkingSetV2 derives the working set size from cgroup v2 memory stats:
//
//	workingset = usage - inactive_file
//
// The result is clamped at zero. The caller must pass a non-nil memory.
func getWorkingSetV2(memory *cg2.MemoryStat) uint64 {
        usage, inactive := memory.Usage, memory.InactiveFile
        if usage <= inactive {
                // Nothing left once the inactive file cache is discounted.
                return 0
        }
        return usage - inactive
}

// isMemoryUnlimited reports whether the memory limit v is large enough to be
// treated as "no limit".
func isMemoryUnlimited(v uint64) bool {
        // Threshold above which a limit counts as "unlimited". It is lower than
        // MaxInt64 because the kernel rounds the value.
        // TODO: k8s or cadvisor should export this https://github.com/google/cadvisor/blob/2b6fbacac7598e0140b5bc8428e3bdd7d86cf5b9/metrics/prometheus.go#L1969-L1971
        const unlimitedThreshold uint64 = 1 << 62

        return unlimitedThreshold < v
}

// getAvailableBytes returns the memory still available to a cgroup v1
// container, computed as memory limit - working set bytes.
//
// It returns 0 when the stats carry no usage entry, when the limit is treated
// as unlimited, or when the working set already meets or exceeds the limit.
// The caller must pass a non-nil memory.
//
// See https://github.com/kubernetes/kubernetes/blob/b47f8263e18c7b13dba33fba23187e5e0477cdbd/pkg/kubelet/stats/helper.go#L68-L71
func getAvailableBytes(memory *cg1.MemoryStat, workingSetBytes uint64) uint64 {
        // Tolerate stats without a usage entry (as getWorkingSet does) instead
        // of dereferencing a nil pointer.
        if memory.Usage == nil {
                return 0
        }
        limit := memory.Usage.Limit
        // Unsigned subtraction would wrap around to a huge value if the working
        // set were larger than the limit, so clamp the result at zero.
        if isMemoryUnlimited(limit) || workingSetBytes >= limit {
                return 0
        }
        return limit - workingSetBytes
}

// getAvailableBytesV2 returns the memory still available to a cgroup v2
// container, computed as memory limit (memory.max) - working set bytes.
//
// It returns 0 when the limit is treated as unlimited or when the working set
// already meets or exceeds the limit. The caller must pass a non-nil memory.
func getAvailableBytesV2(memory *cg2.MemoryStat, workingSetBytes uint64) uint64 {
        limit := memory.UsageLimit
        // Unsigned subtraction would wrap around to a huge value if the working
        // set were larger than the limit, so clamp the result at zero.
        if isMemoryUnlimited(limit) || workingSetBytes >= limit {
                return 0
        }
        return limit - workingSetBytes
}

// cpuContainerStats converts cgroup metrics into CRI CPU usage.
//
// stats must be a *cg1.Metrics or a *cg2.Metrics; any other value yields an
// error. When the metrics carry no CPU data, (nil, nil) is returned.
// timestamp is reported as the sample time in Unix nanoseconds. ID and
// isSandbox are not used by this implementation.
func (c *criService) cpuContainerStats(ID string, isSandbox bool, stats interface{}, timestamp time.Time) (*runtime.CpuUsage, error) {
        switch metrics := stats.(type) {
        case *cg1.Metrics:
                if metrics.CPU != nil && metrics.CPU.Usage != nil {
                        return &runtime.CpuUsage{
                                Timestamp:            timestamp.UnixNano(),
                                UsageCoreNanoSeconds: &runtime.UInt64Value{Value: metrics.CPU.Usage.Total},
                        }, nil
                }
        case *cg2.Metrics:
                if metrics.CPU != nil {
                        // cgroup v2 reports usage in microseconds; convert to nanoseconds.
                        usageCoreNanoSeconds := metrics.CPU.UsageUsec * 1000

                        return &runtime.CpuUsage{
                                Timestamp:            timestamp.UnixNano(),
                                UsageCoreNanoSeconds: &runtime.UInt64Value{Value: usageCoreNanoSeconds},
                        }, nil
                }
        default:
                // reflect.TypeOf returns nil for a nil interface and Type.Elem
                // panics for kinds without an element type, so resolve the
                // package path defensively rather than crashing while building
                // the error.
                var pkgPath string
                if t := reflect.TypeOf(metrics); t != nil {
                        if t.Kind() == reflect.Pointer {
                                t = t.Elem()
                        }
                        pkgPath = t.PkgPath()
                }
                return nil, fmt.Errorf("unexpected metrics type: %T from %s", metrics, pkgPath)
        }
        return nil, nil
}

// memoryContainerStats converts cgroup metrics into CRI memory usage.
//
// stats must be a *cg1.Metrics or a *cg2.Metrics; any other value yields an
// error. When the metrics carry no memory data, (nil, nil) is returned.
// timestamp is reported as the sample time in Unix nanoseconds. ID is not used
// by this implementation.
func (c *criService) memoryContainerStats(ID string, stats interface{}, timestamp time.Time) (*runtime.MemoryUsage, error) {
        switch metrics := stats.(type) {
        case *cg1.Metrics:
                if metrics.Memory != nil && metrics.Memory.Usage != nil {
                        workingSetBytes := getWorkingSet(metrics.Memory)

                        return &runtime.MemoryUsage{
                                Timestamp: timestamp.UnixNano(),
                                WorkingSetBytes: &runtime.UInt64Value{
                                        Value: workingSetBytes,
                                },
                                AvailableBytes:  &runtime.UInt64Value{Value: getAvailableBytes(metrics.Memory, workingSetBytes)},
                                UsageBytes:      &runtime.UInt64Value{Value: metrics.Memory.Usage.Usage},
                                RssBytes:        &runtime.UInt64Value{Value: metrics.Memory.TotalRSS},
                                PageFaults:      &runtime.UInt64Value{Value: metrics.Memory.TotalPgFault},
                                MajorPageFaults: &runtime.UInt64Value{Value: metrics.Memory.TotalPgMajFault},
                        }, nil
                }
        case *cg2.Metrics:
                if metrics.Memory != nil {
                        workingSetBytes := getWorkingSetV2(metrics.Memory)

                        return &runtime.MemoryUsage{
                                Timestamp: timestamp.UnixNano(),
                                WorkingSetBytes: &runtime.UInt64Value{
                                        Value: workingSetBytes,
                                },
                                AvailableBytes: &runtime.UInt64Value{Value: getAvailableBytesV2(metrics.Memory, workingSetBytes)},
                                UsageBytes:     &runtime.UInt64Value{Value: metrics.Memory.Usage},
                                // Use Anon memory for RSS as cAdvisor on cgroupv2
                                // see https://github.com/google/cadvisor/blob/a9858972e75642c2b1914c8d5428e33e6392c08a/container/libcontainer/handler.go#L799
                                RssBytes:        &runtime.UInt64Value{Value: metrics.Memory.Anon},
                                PageFaults:      &runtime.UInt64Value{Value: metrics.Memory.Pgfault},
                                MajorPageFaults: &runtime.UInt64Value{Value: metrics.Memory.Pgmajfault},
                        }, nil
                }
        default:
                // reflect.TypeOf returns nil for a nil interface and Type.Elem
                // panics for kinds without an element type, so resolve the
                // package path defensively rather than crashing while building
                // the error.
                var pkgPath string
                if t := reflect.TypeOf(metrics); t != nil {
                        if t.Kind() == reflect.Pointer {
                                t = t.Elem()
                        }
                        pkgPath = t.PkgPath()
                }
                return nil, fmt.Errorf("unexpected metrics type: %T from %s", metrics, pkgPath)
        }
        return nil, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "encoding/json"
        "fmt"

        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/errdefs"

        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// ContainerStatus looks up the requested container and returns its CRI status
// together with the extra info map produced for verbose requests.
func (c *criService) ContainerStatus(ctx context.Context, r *runtime.ContainerStatusRequest) (*runtime.ContainerStatusResponse, error) {
        id := r.GetContainerId()
        container, err := c.containerStore.Get(id)
        if err != nil {
                return nil, fmt.Errorf("an error occurred when try to find container %q: %w", id, err)
        }

        // TODO(random-liu): Clean up the following logic in CRI.
        // Current assumption:
        // * ImageSpec in container config is image ID.
        // * ImageSpec in container status is image tag.
        // * ImageRef in container status is repo digest.
        spec, imageRef := container.Config.GetImage(), container.ImageRef
        image, err := c.GetImage(imageRef)
        switch {
        case err == nil:
                repoTags, repoDigests := util.ParseImageReferences(image.References)
                if len(repoTags) > 0 {
                        // Based on current behavior of dockershim, this field should be
                        // image tag.
                        spec = &runtime.ImageSpec{Image: repoTags[0]}
                }
                if len(repoDigests) > 0 {
                        // Based on the CRI definition, this field will be consumed by user.
                        imageRef = repoDigests[0]
                }
        case !errdefs.IsNotFound(err):
                // A missing image is tolerated; any other lookup failure is not.
                return nil, fmt.Errorf("failed to get image %q: %w", imageRef, err)
        }

        status := toCRIContainerStatus(container, spec, imageRef)
        if status.GetCreatedAt() == 0 {
                // CRI doesn't allow CreatedAt == 0, so fall back to the creation
                // time recorded in the container info.
                ctrInfo, err := container.Container.Info(ctx)
                if err != nil {
                        return nil, fmt.Errorf("failed to get CreatedAt in %q state: %w", status.State, err)
                }
                status.CreatedAt = ctrInfo.CreatedAt.UnixNano()
        }

        verboseInfo, err := toCRIContainerInfo(ctx, container, r.GetVerbose())
        if err != nil {
                return nil, fmt.Errorf("failed to get verbose container info: %w", err)
        }

        resp := &runtime.ContainerStatusResponse{
                Status: status,
                Info:   verboseInfo,
        }
        return resp, nil
}

// toCRIContainerStatus builds the CRI container status from the internal
// container object, using spec and imageRef for the image-related fields.
func toCRIContainerStatus(container containerstore.Container, spec *runtime.ImageSpec, imageRef string) *runtime.ContainerStatus {
        meta := container.Metadata
        status := container.Status.Get()
        state := status.State()

        // An exited container without an explicit reason gets a default one
        // derived from its exit code.
        reason := status.Reason
        if reason == "" && state == runtime.ContainerState_CONTAINER_EXITED {
                reason = errorExitReason
                if status.ExitCode == 0 {
                        reason = completeExitReason
                }
        }

        // Started/finished timestamps stay zero unless the state calls for
        // them: running reports only the start time, exited and unknown report
        // both; any other state (e.g. created) reports neither.
        var startedAt, finishedAt int64
        switch state {
        case runtime.ContainerState_CONTAINER_RUNNING:
                startedAt = status.StartedAt
        case runtime.ContainerState_CONTAINER_EXITED, runtime.ContainerState_CONTAINER_UNKNOWN:
                startedAt = status.StartedAt
                finishedAt = status.FinishedAt
        }

        return &runtime.ContainerStatus{
                Id:          meta.ID,
                Metadata:    meta.Config.GetMetadata(),
                State:       state,
                CreatedAt:   status.CreatedAt,
                StartedAt:   startedAt,
                FinishedAt:  finishedAt,
                ExitCode:    status.ExitCode,
                Image:       spec,
                ImageRef:    imageRef,
                Reason:      reason,
                Message:     status.Message,
                Labels:      meta.Config.GetLabels(),
                Annotations: meta.Config.GetAnnotations(),
                Mounts:      meta.Config.GetMounts(),
                LogPath:     meta.LogPath,
                Resources:   status.Resources,
        }
}

// ContainerInfo is extra information for a container. It is serialized to
// JSON and returned under the "info" key of a verbose container status
// response.
type ContainerInfo struct {
        // TODO(random-liu): Add sandboxID in CRI container status.
        // SandboxID is the ID of the sandbox the container belongs to.
        SandboxID      string                   `json:"sandboxID"`
        // Pid is the pid recorded in the container status.
        Pid            uint32                   `json:"pid"`
        // Removing mirrors the removing flag of the container status.
        Removing       bool                     `json:"removing"`
        // SnapshotKey is the snapshot key from the containerd container info.
        SnapshotKey    string                   `json:"snapshotKey"`
        // Snapshotter is the snapshotter name from the containerd container info.
        Snapshotter    string                   `json:"snapshotter"`
        // RuntimeType is the runtime name from the containerd container info.
        RuntimeType    string                   `json:"runtimeType"`
        // RuntimeOptions holds the runtime options decoded from the containerd
        // container info.
        RuntimeOptions interface{}              `json:"runtimeOptions"`
        // Config is the CRI container config stored in the container metadata.
        Config         *runtime.ContainerConfig `json:"config"`
        // RuntimeSpec is the OCI runtime spec of the container.
        RuntimeSpec    *runtimespec.Spec        `json:"runtimeSpec"`
}

// toCRIContainerInfo builds the info map of a CRI container status response.
// For non-verbose requests it returns a nil map; otherwise the map holds a
// single "info" entry containing the JSON encoding of a ContainerInfo.
func toCRIContainerInfo(ctx context.Context, container containerstore.Container, verbose bool) (map[string]string, error) {
        if !verbose {
                return nil, nil
        }

        meta := container.Metadata
        status := container.Status.Get()

        spec, err := container.Container.Spec(ctx)
        if err != nil {
                return nil, fmt.Errorf("failed to get container runtime spec: %w", err)
        }

        ctrInfo, err := container.Container.Info(ctx)
        if err != nil {
                return nil, fmt.Errorf("failed to get container info: %w", err)
        }

        runtimeOptions, err := getRuntimeOptions(ctrInfo)
        if err != nil {
                return nil, fmt.Errorf("failed to get runtime options: %w", err)
        }

        // TODO(random-liu): Change CRI status info to use array instead of map.
        ci := &ContainerInfo{
                SandboxID:      container.SandboxID,
                Pid:            status.Pid,
                Removing:       status.Removing,
                SnapshotKey:    ctrInfo.SnapshotKey,
                Snapshotter:    ctrInfo.Snapshotter,
                RuntimeType:    ctrInfo.Runtime.Name,
                RuntimeOptions: runtimeOptions,
                Config:         meta.Config,
                RuntimeSpec:    spec,
        }

        encoded, err := json.Marshal(ci)
        if err != nil {
                return nil, fmt.Errorf("failed to marshal info %v: %w", ci, err)
        }
        return map[string]string{"info": string(encoded)}, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "sync/atomic"
        "syscall"
        "time"

        eventtypes "github.com/containerd/containerd/api/events"
        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"

        "github.com/moby/sys/signal"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// StopContainer stops a running container, giving it the request's timeout
// (in seconds) as a grace period, notifies NRI and records how long the stop
// took.
func (c *criService) StopContainer(ctx context.Context, r *runtime.StopContainerRequest) (*runtime.StopContainerResponse, error) {
        begin := time.Now()

        // Look the container up in the container store.
        id := r.GetContainerId()
        container, err := c.containerStore.Get(id)
        if err != nil {
                return nil, fmt.Errorf("an error occurred when try to find container %q: %w", id, err)
        }

        gracePeriod := time.Duration(r.GetTimeout()) * time.Second
        if err := c.stopContainer(ctx, container, gracePeriod); err != nil {
                return nil, err
        }

        // NRI is notified even when the sandbox can no longer be found; in that
        // case no sandbox is passed along. NRI failures are only logged.
        var nriErr error
        if sandbox, err := c.sandboxStore.Get(container.SandboxID); err != nil {
                nriErr = c.nri.StopContainer(ctx, nil, &container)
        } else {
                nriErr = c.nri.StopContainer(ctx, &sandbox, &container)
        }
        if nriErr != nil {
                log.G(ctx).WithError(nriErr).Error("NRI failed to stop container")
        }

        info, err := container.Container.Info(ctx)
        if err != nil {
                return nil, fmt.Errorf("get container info: %w", err)
        }

        containerStopTimer.WithValues(info.Runtime.Name).UpdateSince(begin)

        return &runtime.StopContainerResponse{}, nil
}

// stopContainer stops a container based on the container metadata.
//
// Containers that are neither running nor in the unknown state are left alone
// and nil is returned. With timeout > 0 the container's stop signal is sent
// first and the container is given up to timeout to stop; if it has not
// stopped by then (or timeout is not positive) SIGKILL is sent and the
// function waits until the container is stopped or ctx is done.
func (c *criService) stopContainer(ctx context.Context, container containerstore.Container, timeout time.Duration) error {
        id := container.ID
        sandboxID := container.SandboxID

        // Return without error if container is not running. This makes sure that
        // stop only takes real action after the container is started.
        state := container.Status.Get().State()
        if state != runtime.ContainerState_CONTAINER_RUNNING &&
                state != runtime.ContainerState_CONTAINER_UNKNOWN {
                log.G(ctx).Infof("Container to stop %q must be in running or unknown state, current state %q",
                        id, criContainerStateToString(state))
                return nil
        }

        task, err := container.Container.Task(ctx, nil)
        if err != nil {
                // Only a missing task is tolerated; any other failure is returned.
                if !errdefs.IsNotFound(err) {
                        return fmt.Errorf("failed to get task for container %q: %w", id, err)
                }
                // Don't return for unknown state, some cleanup needs to be done.
                if state == runtime.ContainerState_CONTAINER_UNKNOWN {
                        return c.cleanupUnknownContainer(ctx, id, container, sandboxID)
                }
                return nil
        }

        // Handle unknown state.
        if state == runtime.ContainerState_CONTAINER_UNKNOWN {
                // Start an exit handler for containers in unknown state.
                waitCtx, waitCancel := context.WithCancel(ctrdutil.NamespacedContext())
                defer waitCancel()
                exitCh, err := task.Wait(waitCtx)
                if err != nil {
                        if !errdefs.IsNotFound(err) {
                                return fmt.Errorf("failed to wait for task for %q: %w", id, err)
                        }
                        // The task is already gone: just clean up the container.
                        return c.cleanupUnknownContainer(ctx, id, container, sandboxID)
                }

                exitCtx, exitCancel := context.WithCancel(context.Background())
                stopCh := c.startContainerExitMonitor(exitCtx, id, task.Pid(), exitCh)
                // Deferred calls run in reverse order, so this runs before the
                // deferred waitCancel above.
                defer func() {
                        exitCancel()
                        // This ensures that exit monitor is stopped before
                        // `Wait` is cancelled, so no exit event is generated
                        // because of the `Wait` cancellation.
                        <-stopCh
                }()
        }

        // We only need to kill the task. The event handler will Delete the
        // task from containerd after it handles the Exited event.
        if timeout > 0 {
                // Default signal, overridden by the container's own stop signal or,
                // failing that, by the one from the image config.
                stopSignal := "SIGTERM"
                if container.StopSignal != "" {
                        stopSignal = container.StopSignal
                } else {
                        // The image may have been deleted, and the `StopSignal` field is
                        // just introduced to handle that.
                        // However, for containers created before the `StopSignal` field is
                        // introduced, still try to get the stop signal from the image config.
                        // If the image has been deleted, logging an error and using the
                        // default SIGTERM is still better than returning error and leaving
                        // the container unstoppable. (See issue #990)
                        // TODO(random-liu): Remove this logic when containerd 1.2 is deprecated.
                        image, err := c.GetImage(container.ImageRef)
                        if err != nil {
                                if !errdefs.IsNotFound(err) {
                                        return fmt.Errorf("failed to get image %q: %w", container.ImageRef, err)
                                }
                                log.G(ctx).Warningf("Image %q not found, stop container with signal %q", container.ImageRef, stopSignal)
                        } else {
                                if image.ImageSpec.Config.StopSignal != "" {
                                        stopSignal = image.ImageSpec.Config.StopSignal
                                }
                        }
                }
                sig, err := signal.ParseSignal(stopSignal)
                if err != nil {
                        return fmt.Errorf("failed to parse stop signal %q: %w", stopSignal, err)
                }

                // sswt is true when this call should send the stop signal. The
                // compare-and-swap flips the flag from 0 to 1 exactly once, so
                // only the first stop request with a timeout sends the signal.
                // Without the flag there is no way to tell, and the signal is
                // always sent.
                var sswt bool
                if container.IsStopSignaledWithTimeout == nil {
                        log.G(ctx).Infof("unable to ensure stop signal %v was not sent twice to container %v", sig, id)
                        sswt = true
                } else {
                        sswt = atomic.CompareAndSwapUint32(container.IsStopSignaledWithTimeout, 0, 1)
                }

                if sswt {
                        log.G(ctx).Infof("Stop container %q with signal %v", id, sig)
                        // A not-found error from Kill is not treated as a failure.
                        if err = task.Kill(ctx, sig); err != nil && !errdefs.IsNotFound(err) {
                                return fmt.Errorf("failed to stop container %q: %w", id, err)
                        }
                } else {
                        log.G(ctx).Infof("Skipping the sending of signal %v to container %q because a prior stop with timeout>0 request already sent the signal", sig, id)
                }

                // Give the container up to timeout to stop after the signal.
                sigTermCtx, sigTermCtxCancel := context.WithTimeout(ctx, timeout)
                defer sigTermCtxCancel()
                err = c.waitContainerStop(sigTermCtx, container)
                if err == nil {
                        // Container stopped on first signal no need for SIGKILL
                        return nil
                }
                // If the parent context was cancelled or exceeded return immediately
                if ctx.Err() != nil {
                        return ctx.Err()
                }
                // sigTermCtx was exceeded. Send SIGKILL
                log.G(ctx).Debugf("Stop container %q with signal %v timed out", id, sig)
        }

        log.G(ctx).Infof("Kill container %q", id)
        // A not-found error from Kill is not treated as a failure.
        if err = task.Kill(ctx, syscall.SIGKILL); err != nil && !errdefs.IsNotFound(err) {
                return fmt.Errorf("failed to kill container %q: %w", id, err)
        }

        // Wait until the container stop is observed or ctx is done.
        err = c.waitContainerStop(ctx, container)
        if err != nil {
                return fmt.Errorf("an error occurs during waiting for container %q to be killed: %w", id, err)
        }
        return nil
}

// waitContainerStop blocks until the container is reported as stopped, in
// which case it returns nil, or until ctx is cancelled or its deadline is
// exceeded, in which case it returns an error wrapping ctx.Err().
func (c *criService) waitContainerStop(ctx context.Context, container containerstore.Container) error {
        stopped := container.Stopped()
        select {
        case <-stopped:
                return nil
        case <-ctx.Done():
                return fmt.Errorf("wait container %q: %w", container.ID, ctx.Err())
        }
}

// cleanupUnknownContainer cleans up a stopped container that is in the
// unknown state.
func (c *criService) cleanupUnknownContainer(ctx context.Context, id string, cntr containerstore.Container, sandboxID string) error {
        // Synthesize an exit event (unknown exit code, exited now) so that
        // handleContainerExit can be reused to do the cleanup.
        exitEvent := &eventtypes.TaskExit{
                ContainerID: id,
                ID:          id,
                Pid:         0,
                ExitStatus:  unknownExitCode,
                ExitedAt:    protobuf.ToTimestamp(time.Now()),
        }
        return c.handleContainerExit(ctx, exitEvent, cntr, sandboxID)
}

//go:build !darwin && !freebsd

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        gocontext "context"
        "fmt"

        "github.com/containerd/typeurl/v2"
        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"

        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
)

// UpdateContainerResources updates the resources of the requested container.
// NRI is consulted before the update (and may adjust the requested Linux
// resources) and notified after it; a failed post-update notification is only
// logged.
func (c *criService) UpdateContainerResources(ctx context.Context, r *runtime.UpdateContainerResourcesRequest) (retRes *runtime.UpdateContainerResourcesResponse, retErr error) {
        container, err := c.containerStore.Get(r.GetContainerId())
        if err != nil {
                return nil, fmt.Errorf("failed to find container: %w", err)
        }

        sandbox, err := c.sandboxStore.Get(container.SandboxID)
        if err != nil {
                return nil, err
        }

        // Adjustments returned by NRI are written back into the request so the
        // update below applies them.
        resources := r.GetLinux()
        adjusted, err := c.nri.UpdateContainerResources(ctx, &sandbox, &container, resources)
        if err != nil {
                return nil, fmt.Errorf("NRI container update failed: %w", err)
        }
        if adjusted != nil {
                *resources = *adjusted
        }

        // Update resources in status update transaction, so that:
        // 1) There won't be race condition with container start.
        // 2) There won't be concurrent resource update to the same container.
        applyUpdate := func(status containerstore.Status) (containerstore.Status, error) {
                return c.updateContainerResources(ctx, container, r, status)
        }
        if err := container.Status.UpdateSync(applyUpdate); err != nil {
                return nil, fmt.Errorf("failed to update resources: %w", err)
        }

        if err := c.nri.PostUpdateContainerResources(ctx, &sandbox, &container); err != nil {
                log.G(ctx).WithError(err).Errorf("NRI post-update notification failed")
        }

        return &runtime.UpdateContainerResourcesResponse{}, nil
}

// updateContainerResources applies the resources from request r to container
// cntr and returns the status to store for it.
//
// The container spec is always rewritten with the new resources. If the
// container is running, the update is also applied to its task. On error the
// previous spec is restored and the returned status is the unmodified input
// status; on success the returned status carries the resources from the new
// spec.
func (c *criService) updateContainerResources(ctx context.Context,
        cntr containerstore.Container,
        r *runtime.UpdateContainerResourcesRequest,
        status containerstore.Status) (newStatus containerstore.Status, retErr error) {

        newStatus = status
        id := cntr.ID
        // Do not update the container when there is a removal in progress.
        if status.Removing {
                return newStatus, fmt.Errorf("container %q is in removing state", id)
        }

        // Update container spec. If the container is not started yet, updating
        // spec makes sure that the resource limits are correct when start;
        // if the container is already started, updating spec is still required,
        // the spec will become our source of truth for resource limits.
        oldSpec, err := cntr.Container.Spec(ctx)
        if err != nil {
                return newStatus, fmt.Errorf("failed to get container spec: %w", err)
        }
        newSpec, err := updateOCIResource(ctx, oldSpec, r, c.config)
        if err != nil {
                return newStatus, fmt.Errorf("failed to update resource in spec: %w", err)
        }

        if err := updateContainerSpec(ctx, cntr.Container, newSpec); err != nil {
                return newStatus, err
        }
        // From here on the new spec is stored. The deferred function inspects
        // the named result retErr to either roll the spec back or finalize the
        // returned status.
        defer func() {
                if retErr != nil {
                        // A context from ctrdutil.DeferContext is used for the
                        // rollback instead of ctx.
                        deferCtx, deferCancel := ctrdutil.DeferContext()
                        defer deferCancel()
                        // Reset spec on error.
                        if err := updateContainerSpec(deferCtx, cntr.Container, oldSpec); err != nil {
                                log.G(ctx).WithError(err).Errorf("Failed to update spec %+v for container %q", oldSpec, id)
                        }
                } else {
                        // Update container status only when the spec is updated
                        newStatus = copyResourcesToStatus(newSpec, status)
                }
        }()

        // If container is not running, only update spec is enough, new resource
        // limit will be applied when container start.
        if status.State() != runtime.ContainerState_CONTAINER_RUNNING {
                return newStatus, nil
        }

        task, err := cntr.Container.Task(ctx, nil)
        if err != nil {
                if errdefs.IsNotFound(err) {
                        // Task exited already.
                        return newStatus, nil
                }
                return newStatus, fmt.Errorf("failed to get task: %w", err)
        }
        // newSpec.Linux / newSpec.Windows won't be nil
        if err := task.Update(ctx, containerd.WithResources(getResources(newSpec))); err != nil {
                if errdefs.IsNotFound(err) {
                        // Task exited already.
                        return newStatus, nil
                }
                return newStatus, fmt.Errorf("failed to update resources: %w", err)
        }
        return newStatus, nil
}

// updateContainerSpec replaces the spec stored for cntr with spec.
func updateContainerSpec(ctx context.Context, cntr containerd.Container, spec *runtimespec.Spec) error {
        marshaled, err := typeurl.MarshalAny(spec)
        if err != nil {
                return fmt.Errorf("failed to marshal spec %+v: %w", spec, err)
        }

        // Update option that swaps in the freshly marshaled spec.
        setSpec := func(_ gocontext.Context, _ *containerd.Client, ctr *containers.Container) error {
                ctr.Spec = marshaled
                return nil
        }
        if err := cntr.Update(ctx, setSpec); err != nil {
                return fmt.Errorf("failed to update container spec: %w", err)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"

        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        "github.com/containerd/containerd/v2/internal/cri/opts"
        "github.com/containerd/containerd/v2/internal/cri/util"
)

// updateOCIResource returns a copy of spec with the Linux resource limits from
// request r applied. The spec passed in is left unmodified.
func updateOCIResource(ctx context.Context, spec *runtimespec.Spec, r *runtime.UpdateContainerResourcesRequest,
        config criconfig.Config) (*runtimespec.Spec, error) {

        // Work on a deep copy to make sure the old spec is not changed.
        cloned := &runtimespec.Spec{}
        if err := util.DeepCopy(cloned, spec); err != nil {
                return nil, fmt.Errorf("failed to deep copy: %w", err)
        }
        if cloned.Linux == nil {
                cloned.Linux = &runtimespec.Linux{}
        }

        applyResources := opts.WithResources(r.GetLinux(), config.TolerateMissingHugetlbController, config.DisableHugetlbController)
        if err := applyResources(ctx, nil, nil, cloned); err != nil {
                return nil, fmt.Errorf("unable to set linux container resources: %w", err)
        }
        return cloned, nil
}

// getResources returns the Linux resources section of spec.
//
// The result is always a *runtimespec.LinuxResources wrapped in an interface
// value; it holds a nil pointer when spec, spec.Linux or spec.Linux.Resources
// is nil, so a spec without a Linux section no longer causes a nil-pointer
// dereference.
func getResources(spec *runtimespec.Spec) interface{} {
        var resources *runtimespec.LinuxResources
        if spec != nil && spec.Linux != nil {
                resources = spec.Linux.Resources
        }
        return resources
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "time"

        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        eventtypes "github.com/containerd/containerd/api/events"
        apitasks "github.com/containerd/containerd/api/services/tasks/v1"
        containerd "github.com/containerd/containerd/v2/client"
        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        containerdio "github.com/containerd/containerd/v2/pkg/cio"
        "github.com/containerd/containerd/v2/pkg/protobuf"
)

const (
        // handleEventTimeout is the timeout for handling a single event. The
        // event monitor handles events serially, so one blocked event would
        // prevent every other event from being handled.
        // Each event is therefore handled under this timeout; events that time
        // out are requeued and handled again in the future.
        handleEventTimeout = 10 * time.Second
)

// startSandboxExitMonitor spawns a goroutine that waits for either an exit
// status on exitCh for sandbox id or for ctx to be done. On exit it builds a
// SandboxExit event and handles it immediately; if handling fails, the event
// is handed to the event monitor's backoff queue. The returned channel is
// closed when the goroutine finishes.
func (c *criService) startSandboxExitMonitor(ctx context.Context, id string, exitCh <-chan containerd.ExitStatus) <-chan struct{} {
        done := make(chan struct{})
        go func() {
                defer close(done)

                var res containerd.ExitStatus
                select {
                case <-ctx.Done():
                        return
                case res = <-exitCh:
                }

                code, at, err := res.Result()
                if err != nil {
                        log.L.WithError(err).Errorf("failed to get task exit status for %q", id)
                        // Fall back to placeholder values so the exit is still recorded.
                        code, at = unknownExitCode, time.Now()
                }

                evt := &eventtypes.SandboxExit{
                        SandboxID:  id,
                        ExitStatus: code,
                        ExitedAt:   protobuf.ToTimestamp(at),
                }
                log.L.Infof("received exit event %+v", evt)

                // handle runs the exit handling under its own namespaced,
                // time-limited context.
                handle := func() error {
                        hctx, cancel := context.WithTimeout(ctrdutil.NamespacedContext(), handleEventTimeout)
                        defer cancel()

                        sb, err := c.sandboxStore.Get(id)
                        switch {
                        case err == nil:
                                return c.handleSandboxExit(hctx, sb, code, at)
                        case errdefs.IsNotFound(err):
                                // Sandbox is not in the store: nothing to update.
                                return nil
                        default:
                                return fmt.Errorf("failed to get sandbox %s: %w", evt.SandboxID, err)
                        }
                }
                if err := handle(); err != nil {
                        log.L.WithError(err).Errorf("failed to handle sandbox TaskExit event %+v", evt)
                        c.eventMonitor.Backoff(id, evt)
                }
        }()
        return done
}

// handleSandboxExit records the exit of sandbox sb: it marks the stored status
// as not ready with the given exit status and time, signals the sandbox stop,
// and emits a CONTAINER_STOPPED event for the sandbox.
func (c *criService) handleSandboxExit(ctx context.Context, sb sandboxstore.Sandbox, exitStatus uint32, exitTime time.Time) error {
        markExited := func(st sandboxstore.Status) (sandboxstore.Status, error) {
                st.ExitedAt = exitTime
                st.ExitStatus = exitStatus
                st.Pid = 0
                st.State = sandboxstore.StateNotReady
                return st, nil
        }
        if err := sb.Status.Update(markExited); err != nil {
                return fmt.Errorf("failed to update sandbox state: %w", err)
        }

        // Propagate the stop through the sandbox's stop channel.
        sb.Stop()
        c.generateAndSendContainerEvent(ctx, sb.ID, sb.ID, runtime.ContainerEventType_CONTAINER_STOPPED_EVENT)
        return nil
}

// startContainerExitMonitor spawns a goroutine that waits for either an exit
// status on exitCh for container id or for ctx to be done. On exit it builds a
// TaskExit event and handles it immediately; if handling fails, the event is
// handed to the event monitor's backoff queue. The returned channel is closed
// when the goroutine finishes.
func (c *criService) startContainerExitMonitor(ctx context.Context, id string, pid uint32, exitCh <-chan containerd.ExitStatus) <-chan struct{} {
        done := make(chan struct{})
        go func() {
                defer close(done)

                var res containerd.ExitStatus
                select {
                case <-ctx.Done():
                        return
                case res = <-exitCh:
                }

                code, at, err := res.Result()
                if err != nil {
                        log.L.WithError(err).Errorf("failed to get task exit status for %q", id)
                        // Fall back to placeholder values so the exit is still recorded.
                        code, at = unknownExitCode, time.Now()
                }

                evt := &eventtypes.TaskExit{
                        ContainerID: id,
                        ID:          id,
                        Pid:         pid,
                        ExitStatus:  code,
                        ExitedAt:    protobuf.ToTimestamp(at),
                }
                log.L.Infof("received exit event %+v", evt)

                // handle runs the exit handling under its own namespaced,
                // time-limited context.
                handle := func() error {
                        hctx, cancel := context.WithTimeout(ctrdutil.NamespacedContext(), handleEventTimeout)
                        defer cancel()

                        cntr, err := c.containerStore.Get(evt.ID)
                        switch {
                        case err == nil:
                                return c.handleContainerExit(hctx, evt, cntr, cntr.SandboxID)
                        case errdefs.IsNotFound(err):
                                // Container is not in the store: nothing to update.
                                return nil
                        default:
                                return fmt.Errorf("failed to get container %s: %w", evt.ID, err)
                        }
                }
                if err := handle(); err != nil {
                        log.L.WithError(err).Errorf("failed to handle container TaskExit event %+v", evt)
                        c.eventMonitor.Backoff(id, evt)
                }
        }()
        return done
}

// handleContainerExit handles a TaskExit event for a container.
//
// It loads the container's task and deletes it (killing the process),
// falls back to deleting the task through the task service when the task is
// reported as not found, then records the exit in the container status,
// signals the container stop and emits a CONTAINER_STOPPED event.
//
// It returns an error when loading or deleting the task fails with anything
// other than the tolerated not-found/unavailable errors, when the task-service
// cleanup fails, or when the status update fails.
func (c *criService) handleContainerExit(ctx context.Context, e *eventtypes.TaskExit, cntr containerstore.Container, sandboxID string) error {
        // Attach container IO so that `Delete` can clean up the stream properly.
        task, err := cntr.Container.Task(ctx,
                func(*containerdio.FIFOSet) (containerdio.IO, error) {
                        // We can't directly return cntr.IO here, because
                        // even if cntr.IO is nil, the cio.IO interface
                        // is not.
                        // See https://tour.golang.org/methods/12:
                        //   Note that an interface value that holds a nil
                        //   concrete value is itself non-nil.
                        if cntr.IO != nil {
                                return cntr.IO, nil
                        }
                        return nil, nil
                },
        )
        if err != nil {
                // NotFound and Unavailable are tolerated: err is kept and
                // execution continues to the cleanup and status update below.
                if !errdefs.IsNotFound(err) && !errdefs.IsUnavailable(err) {
                        return fmt.Errorf("failed to load task for container: %w", err)
                }
        } else {
                // TODO(random-liu): [P1] This may block the loop, we may want to spawn a worker
                if _, err = task.Delete(ctx, c.nri.WithContainerExit(&cntr), containerd.WithProcessKill); err != nil {
                        if !errdefs.IsNotFound(err) {
                                return fmt.Errorf("failed to stop container: %w", err)
                        }
                        // Move on to make sure container status is updated.
                }
        }

        // NOTE: Both cntr.Container.Task and task.Delete may return
        // ErrNotFound, which doesn't mean that the shim instance doesn't exist.
        //
        // There are two caches for task in containerd:
        //
        //   1. io.containerd.service.v1.tasks-service
        //   2. io.containerd.runtime.v2.task
        //
        // The first one maintains the shim connection and shuts down the shim
        // in the Delete API. The second one maintains the lifecycle of the
        // task in the shim server.
        //
        // So, if the shim instance is running and the task has been deleted in
        // the shim server, cntr.Container.Task and task.Delete will receive
        // ErrNotFound. If we don't delete the shim instance in
        // io.containerd.service.v1.tasks-service, the shim will leak.
        //
        // Based on issue containerd/containerd#7496, when the host is under IO
        // pressure, the umount2 syscall can take more than 10 seconds so that
        // the CRI plugin cancels this task.Delete call. However, the shim
        // server isn't aware of this. After returning from the umount2 syscall,
        // the shim server continues to delete the task record. The CRI plugin
        // then retries to delete the task, receives ErrNotFound and marks it as
        // stopped. Therefore, the shim leaks.
        //
        // It's hard to handle the connection lost or request canceled cases in
        // the shim server. We should call the Delete API of
        // io.containerd.service.v1.tasks-service to ensure that the shim
        // instance is shut down.
        //
        // REF:
        // 1. https://github.com/containerd/containerd/issues/7496#issuecomment-1671100968
        // 2. https://github.com/containerd/containerd/issues/8931
        //
        // err here is whatever Task or task.Delete left behind above.
        if errdefs.IsNotFound(err) {
                _, err = c.client.TaskService().Delete(ctx, &apitasks.DeleteTaskRequest{ContainerID: cntr.Container.ID()})
                if err != nil {
                        // Convert the gRPC error so the errdefs check below works.
                        err = errdefs.FromGRPC(err)
                        if !errdefs.IsNotFound(err) {
                                return fmt.Errorf("failed to cleanup container %s in task-service: %w", cntr.Container.ID(), err)
                        }
                }
                log.L.Infof("Ensure that container %s in task-service has been cleanup successfully", cntr.Container.ID())
        }

        err = cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
                // Only record the exit once; an already-set FinishedAt is kept.
                if status.FinishedAt == 0 {
                        status.Pid = 0
                        status.FinishedAt = protobuf.FromTimestamp(e.ExitedAt).UnixNano()
                        status.ExitCode = int32(e.ExitStatus)
                }

                // Unknown state can only transit to EXITED state, so we need
                // to handle unknown state here.
                if status.Unknown {
                        log.L.Debugf("Container %q transited from UNKNOWN to EXITED", cntr.ID)
                        status.Unknown = false
                }
                return status, nil
        })
        if err != nil {
                return fmt.Errorf("failed to update container state: %w", err)
        }
        // Using channel to propagate the information of container stop
        cntr.Stop()
        c.generateAndSendContainerEvent(ctx, cntr.ID, sandboxID, runtime.ContainerEventType_CONTAINER_STOPPED_EVENT)
        return nil
}

// criEventHandler handles containerd events on behalf of a criService; its
// HandleEvent method dispatches each event to that service.
type criEventHandler struct {
        // c is the CRI service whose stores and handlers are used by HandleEvent.
        c *criService
}

// HandleEvent dispatches a containerd event to the matching CRI handler under
// a namespaced context bounded by handleEventTimeout.
//
// TaskExit events are matched against the container store first and the
// sandbox store second; SandboxExit events against the sandbox store; TaskOOM
// events set the container's status reason; image events refresh the image
// by name. Events whose target is not found in the stores, and events of any
// other type, are ignored and nil is returned.
func (ce *criEventHandler) HandleEvent(any interface{}) error {
        ctx, cancel := context.WithTimeout(ctrdutil.NamespacedContext(), handleEventTimeout)
        defer cancel()

        svc := ce.c

        // exitSandbox looks up the sandbox and records its exit; a sandbox
        // missing from the store is not an error.
        exitSandbox := func(sandboxID string, exitStatus uint32, exitedAt time.Time) error {
                sb, err := svc.sandboxStore.Get(sandboxID)
                switch {
                case err == nil:
                case errdefs.IsNotFound(err):
                        return nil
                default:
                        return fmt.Errorf("can't find sandbox for TaskExit event: %w", err)
                }
                if err := svc.handleSandboxExit(ctx, sb, exitStatus, exitedAt); err != nil {
                        return fmt.Errorf("failed to handle sandbox TaskExit event: %w", err)
                }
                return nil
        }

        switch evt := any.(type) {
        case *eventtypes.TaskExit:
                log.L.Infof("TaskExit event %+v", evt)
                // Use ID instead of ContainerID to rule out TaskExit event for exec.
                cntr, err := svc.containerStore.Get(evt.ID)
                if err != nil {
                        if !errdefs.IsNotFound(err) {
                                return fmt.Errorf("can't find container for TaskExit event: %w", err)
                        }
                        // Not a known container; the ID may belong to a sandbox.
                        return exitSandbox(evt.ID, evt.ExitStatus, evt.ExitedAt.AsTime())
                }
                if err := svc.handleContainerExit(ctx, evt, cntr, cntr.SandboxID); err != nil {
                        return fmt.Errorf("failed to handle container TaskExit event: %w", err)
                }
                return nil

        case *eventtypes.SandboxExit:
                log.L.Infof("SandboxExit event %+v", evt)
                return exitSandbox(evt.GetSandboxID(), evt.ExitStatus, evt.ExitedAt.AsTime())

        case *eventtypes.TaskOOM:
                log.L.Infof("TaskOOM event %+v", evt)
                // For TaskOOM, we only care which container it belongs to.
                cntr, err := svc.containerStore.Get(evt.ContainerID)
                switch {
                case err == nil:
                case errdefs.IsNotFound(err):
                        return nil
                default:
                        return fmt.Errorf("can't find container for TaskOOM event: %w", err)
                }
                markOOM := func(status containerstore.Status) (containerstore.Status, error) {
                        status.Reason = oomExitReason
                        return status, nil
                }
                if err := cntr.Status.UpdateSync(markOOM); err != nil {
                        return fmt.Errorf("failed to update container status for TaskOOM event: %w", err)
                }

        case *eventtypes.ImageCreate:
                log.L.Infof("ImageCreate event %+v", evt)
                return svc.UpdateImage(ctx, evt.Name)

        case *eventtypes.ImageUpdate:
                log.L.Infof("ImageUpdate event %+v", evt)
                return svc.UpdateImage(ctx, evt.Name)

        case *eventtypes.ImageDelete:
                log.L.Infof("ImageDelete event %+v", evt)
                return svc.UpdateImage(ctx, evt.Name)
        }

        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package events

import (
        "context"
        "errors"
        "fmt"
        "sync"
        "time"

        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
        "k8s.io/utils/clock"

        eventtypes "github.com/containerd/containerd/api/events"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/internal/cri/constants"
)

// Default timing parameters used by newBackOff.
const (
        // backOffInitDuration is the backoff duration given to a newly created queue.
        backOffInitDuration        = 1 * time.Second
        // backOffMaxDuration is the cap applied when the duration is doubled on re-backoff.
        backOffMaxDuration         = 5 * time.Minute
        // backOffExpireCheckDuration is the period of the ticker that triggers expiry checks.
        backOffExpireCheckDuration = 1 * time.Second
)

// EventHandler is implemented by consumers of the events received by
// EventMonitor. A non-nil error from HandleEvent causes the monitor to put
// the event into its backoff queue for a later retry.
type EventHandler interface {
        // HandleEvent handles one decoded event.
        HandleEvent(any interface{}) error
}

// EventMonitor monitors containerd events and passes them to an EventHandler,
// retrying failed events through a backoff queue.
type EventMonitor struct {
        // ch and errCh are the event and error channels set by Subscribe.
        ch           <-chan *events.Envelope
        errCh        <-chan error
        // ctx is passed to the subscriber in Subscribe; cancel is called by Stop.
        ctx          context.Context
        cancel       context.CancelFunc
        // backOff holds events waiting to be retried.
        backOff      *backOff
        // eventHandler receives every event that is handled or retried.
        eventHandler EventHandler
}

// backOff keeps, per key, a queue of events waiting to be retried, together
// with the ticker that drives the periodic expiry check.
type backOff struct {
        // queuePoolMu is mutex used to protect the queuePool map
        queuePoolMu sync.Mutex

        // queuePool maps a key to its pending backoff queue.
        queuePool map[string]*backOffQueue
        // tickerMu is mutex used to protect the ticker.
        tickerMu      sync.Mutex
        // ticker is created by start and stopped by stop.
        ticker        *time.Ticker
        // minDuration is the backoff duration of a newly created queue.
        minDuration   time.Duration
        // maxDuration caps the duration when a queue is re-backed-off.
        maxDuration   time.Duration
        // checkDuration is the period of ticker.
        checkDuration time.Duration
        // clock is the time source handed to every queue.
        clock         clock.Clock
}

// backOffQueue is the list of pending events for one key and the time at
// which they become eligible for a retry.
type backOffQueue struct {
        // events are the pending events, in the order they were enqueued.
        events     []interface{}
        // expireTime is the instant from which isExpire reports true.
        expireTime time.Time
        // duration is the backoff duration this queue was created with.
        duration   time.Duration
        // clock is the time source used by isExpire.
        clock      clock.Clock
}

// NewEventMonitor creates an event monitor that passes events to
// eventHandler. The monitor owns a cancellable background context, which is
// handed to the subscriber in Subscribe and cancelled by Stop.
func NewEventMonitor(eventHandler EventHandler) *EventMonitor {
        monitor := &EventMonitor{
                backOff:      newBackOff(),
                eventHandler: eventHandler,
        }
        monitor.ctx, monitor.cancel = context.WithCancel(context.Background())
        return monitor
}

// Subscribe subscribes to containerd events matching filters through
// subscriber and stores the resulting event and error channels on the
// monitor for Start to consume.
func (em *EventMonitor) Subscribe(subscriber events.Subscriber, filters []string) {
        envelopes, errs := subscriber.Subscribe(em.ctx, filters...)
        em.ch = envelopes
        em.errCh = errs
}

// convertEvent decodes the typeurl-encoded event e and returns the key used
// for backoff bookkeeping together with the decoded event. The key is the
// container ID for task events, the sandbox ID for sandbox exits and the image
// name for image events. Any other event type yields an error.
func convertEvent(e typeurl.Any) (string, interface{}, error) {
        decoded, err := typeurl.UnmarshalAny(e)
        if err != nil {
                return "", nil, fmt.Errorf("failed to unmarshalany: %w", err)
        }

        var key string
        switch v := decoded.(type) {
        case *eventtypes.TaskExit:
                key = v.ContainerID
        case *eventtypes.TaskOOM:
                key = v.ContainerID
        case *eventtypes.SandboxExit:
                key = v.SandboxID
        case *eventtypes.ImageCreate:
                key = v.Name
        case *eventtypes.ImageUpdate:
                key = v.Name
        case *eventtypes.ImageDelete:
                key = v.Name
        default:
                return "", nil, errors.New("unsupported event")
        }
        return key, decoded, nil
}

// Start launches the monitoring goroutine, which handles subscribed events
// and periodically retries events whose backoff has expired. The returned
// channel delivers the error that terminated the event stream, if any, and is
// closed when the goroutine exits.
//
// NOTE:
//  1. Start must be called after Subscribe; it panics if the event channels
//     have not been set.
//  2. The task exit event is handled first in the individual
//     startSandboxExitMonitor or startContainerExitMonitor goroutine. If that
//     goroutine fails, it puts the event into the backoff retry queue and the
//     event monitor handles it later.
func (em *EventMonitor) Start() <-chan error {
        if em.ch == nil || em.errCh == nil {
                panic("event channel is nil")
        }
        stopErrCh := make(chan error)
        tick := em.backOff.start()

        go func() {
                defer close(stopErrCh)
                for {
                        select {
                        case envelope := <-em.ch:
                                log.L.Debugf("Received containerd event timestamp - %v, namespace - %q, topic - %q", envelope.Timestamp, envelope.Namespace, envelope.Topic)
                                if envelope.Namespace != constants.K8sContainerdNamespace {
                                        log.L.Debugf("Ignoring events in namespace - %q", envelope.Namespace)
                                        continue
                                }
                                key, evt, err := convertEvent(envelope.Event)
                                if err != nil {
                                        log.L.WithError(err).Errorf("Failed to convert event %+v", envelope)
                                        continue
                                }
                                // Keep ordering per key: while earlier events are
                                // still backing off, queue this one behind them.
                                if em.backOff.isInBackOff(key) {
                                        log.L.Infof("Events for %q is in backoff, enqueue event %+v", key, evt)
                                        em.backOff.enBackOff(key, evt)
                                        continue
                                }
                                if err := em.eventHandler.HandleEvent(evt); err != nil {
                                        log.L.WithError(err).Errorf("Failed to handle event %+v for %s", evt, key)
                                        em.backOff.enBackOff(key, evt)
                                }

                        case err := <-em.errCh:
                                // Close errCh in defer directly if there is no error.
                                if err != nil {
                                        log.L.WithError(err).Error("Failed to handle event stream")
                                        stopErrCh <- err
                                }
                                return

                        case <-tick:
                                for _, key := range em.backOff.getExpiredIDs() {
                                        pending := em.backOff.deBackOff(key)
                                        for i, evt := range pending.events {
                                                if err := em.eventHandler.HandleEvent(evt); err != nil {
                                                        log.L.WithError(err).Errorf("Failed to handle backOff event %+v for %s", evt, key)
                                                        // Requeue the failed event and everything after it.
                                                        em.backOff.reBackOff(key, pending.events[i:], pending.duration)
                                                        break
                                                }
                                        }
                                }
                        }
                }
        }()
        return stopErrCh
}

// Backoff puts evt into the backoff queue for key so that the monitor retries
// it later.
func (em *EventMonitor) Backoff(key string, evt interface{}) {
        retryQueue := em.backOff
        retryQueue.enBackOff(key, evt)
}

// Stop stops the event monitor: it stops the backoff ticker and then cancels
// the monitor's context. Once stopped, the monitor can't be started again.
func (em *EventMonitor) Stop() {
        retryQueue := em.backOff
        retryQueue.stop()

        em.cancel()
}

// newBackOff returns an empty backOff configured with the package default
// durations and the real clock.
func newBackOff() *backOff {
        b := new(backOff)
        b.queuePool = make(map[string]*backOffQueue)
        b.minDuration = backOffInitDuration
        b.maxDuration = backOffMaxDuration
        b.checkDuration = backOffExpireCheckDuration
        b.clock = clock.RealClock{}
        return b
}

// getExpiredIDs returns the keys of all queues whose backoff has expired.
// The result is nil when no queue has expired.
func (b *backOff) getExpiredIDs() []string {
        b.queuePoolMu.Lock()
        defer b.queuePoolMu.Unlock()

        var expired []string
        for key, queue := range b.queuePool {
                if !queue.isExpire() {
                        continue
                }
                expired = append(expired, key)
        }
        return expired
}

// isInBackOff reports whether a backoff queue currently exists for key.
func (b *backOff) isInBackOff(key string) bool {
        b.queuePoolMu.Lock()
        defer b.queuePoolMu.Unlock()

        _, queued := b.queuePool[key]
        return queued
}

// enBackOff appends evt to the tail of the queue for key, creating a new
// queue with the minimum backoff duration when none exists yet.
func (b *backOff) enBackOff(key string, evt interface{}) {
        b.queuePoolMu.Lock()
        defer b.queuePoolMu.Unlock()

        queue, exists := b.queuePool[key]
        if !exists {
                b.queuePool[key] = newBackOffQueue([]interface{}{evt}, b.minDuration, b.clock)
                return
        }
        queue.events = append(queue.events, evt)
}

// deBackOff removes the whole queue for key from the pool and returns it.
// It returns nil when no queue exists for key.
func (b *backOff) deBackOff(key string) *backOffQueue {
        b.queuePoolMu.Lock()
        defer b.queuePoolMu.Unlock()

        removed := b.queuePool[key]
        delete(b.queuePool, key)
        return removed
}

// reBackOff starts to back off again for key: it puts the not-yet-handled
// events back into the pool in a queue whose duration is twice oldDuration,
// capped at maxDuration.
//
// Events may be enqueued for the same key by another goroutine (through
// enBackOff) between the moment the old queue was taken out with deBackOff and
// this call. Such events are kept and placed after the re-queued ones instead
// of being overwritten and lost.
func (b *backOff) reBackOff(key string, events []interface{}, oldDuration time.Duration) {
        b.queuePoolMu.Lock()
        defer b.queuePoolMu.Unlock()

        duration := 2 * oldDuration
        if duration > b.maxDuration {
                duration = b.maxDuration
        }
        if pending, ok := b.queuePool[key]; ok {
                // The full slice expression caps the capacity so append copies
                // instead of writing into the caller's backing array.
                events = append(events[:len(events):len(events)], pending.events...)
        }
        b.queuePool[key] = newBackOffQueue(events, duration, b.clock)
}

// start creates the expiry-check ticker, firing every checkDuration, and
// returns its channel.
func (b *backOff) start() <-chan time.Time {
        b.tickerMu.Lock()
        defer b.tickerMu.Unlock()

        ticker := time.NewTicker(b.checkDuration)
        b.ticker = ticker
        return ticker.C
}

// stop stops the expiry-check ticker if start has created one.
func (b *backOff) stop() {
        b.tickerMu.Lock()
        defer b.tickerMu.Unlock()

        if ticker := b.ticker; ticker != nil {
                ticker.Stop()
        }
}

// newBackOffQueue returns a queue holding events that expires init after the
// current time reported by c.
func newBackOffQueue(events []interface{}, init time.Duration, c clock.Clock) *backOffQueue {
        deadline := c.Now().Add(init)
        queue := &backOffQueue{
                clock:      c,
                duration:   init,
                events:     events,
                expireTime: deadline,
        }
        return queue
}

// isExpire reports whether the queue's clock has reached or passed
// expireTime.
func (q *backOffQueue) isExpire() bool {
        now := q.clock.Now()
        // Equivalent to now >= expireTime.
        return !now.Before(q.expireTime)
}

//go:build gofuzz

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "fmt"

        "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
)

// SandboxStore returns the sandbox store of cs. It returns an error when cs
// is not backed by *criService.
func SandboxStore(cs CRIService) (*sandbox.Store, error) {
        if svc, ok := cs.(*criService); ok {
                return svc.sandboxStore, nil
        }
        return nil, fmt.Errorf("%+v is not sbserver.criService", cs)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "path"
        "path/filepath"
        "regexp"
        goruntime "runtime"
        "strconv"
        "strings"
        "time"

        "github.com/containerd/typeurl/v2"
        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/containers"
        crilabels "github.com/containerd/containerd/v2/internal/cri/labels"
        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        imagestore "github.com/containerd/containerd/v2/internal/cri/store/image"
        clabels "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
)

// TODO: Move common helpers for sbserver and podsandbox to a dedicated package once basic services are functional.

const (
        // errorStartReason is the exit reason when a container fails to start.
        errorStartReason = "StartError"
        // errorStartExitCode is the exit code when a container fails to start.
        // 128 matches Docker's behavior.
        // TODO(windows): Figure out what should be used for windows.
        errorStartExitCode = 128
        // completeExitReason is the exit reason when a container exits with code 0.
        completeExitReason = "Completed"
        // errorExitReason is the exit reason when a container exits with a non-zero code.
        errorExitReason = "Error"
        // oomExitReason is the exit reason when a process in the container is OOM killed.
        oomExitReason = "OOMKilled"

        // sandboxesDir contains all sandbox roots. A sandbox root is the running
        // directory of the sandbox; all files created for the sandbox are
        // placed under this directory.
        sandboxesDir = "sandboxes"
        // containersDir contains all container roots.
        containersDir = "containers"
        // nameDelimiter is the delimiter used to construct container/sandbox names.
        nameDelimiter = "_"

        // defaultIfName is the default network interface for the pods.
        defaultIfName = "eth0"

        // devShm is the default path of /dev/shm.
        devShm = "/dev/shm"
        // etcHosts is the default path of the /etc/hosts file.
        etcHosts = "/etc/hosts"
        // etcHostname is the default path of the /etc/hostname file.
        etcHostname = "/etc/hostname"
        // resolvConfPath is the absolute path of resolv.conf on the host or in the container.
        resolvConfPath = "/etc/resolv.conf"
)

// getSandboxRootDir returns the directory of sandbox id under the configured
// root directory, used for managing sandbox files, e.g. hosts files.
func (c *criService) getSandboxRootDir(id string) string {
        root := c.config.RootDir
        return filepath.Join(root, sandboxesDir, id)
}

// getVolatileSandboxRootDir returns the directory of sandbox id under the
// configured state directory, used for managing volatile sandbox files,
// e.g. named pipes.
func (c *criService) getVolatileSandboxRootDir(id string) string {
        state := c.config.StateDir
        return filepath.Join(state, sandboxesDir, id)
}

// getSandboxHostname returns the path of the "hostname" file inside the root
// directory of sandbox id.
func (c *criService) getSandboxHostname(id string) string {
        sandboxRoot := c.getSandboxRootDir(id)
        return filepath.Join(sandboxRoot, "hostname")
}

// getSandboxHosts returns the path of the "hosts" file inside the root
// directory of sandbox id.
func (c *criService) getSandboxHosts(id string) string {
        sandboxRoot := c.getSandboxRootDir(id)
        return filepath.Join(sandboxRoot, "hosts")
}

// getResolvPath returns the path of the "resolv.conf" file inside the root
// directory of sandbox id.
func (c *criService) getResolvPath(id string) string {
        sandboxRoot := c.getSandboxRootDir(id)
        return filepath.Join(sandboxRoot, "resolv.conf")
}

// getSandboxDevShm returns the path of the "shm" entry inside the volatile
// root directory of sandbox id.
func (c *criService) getSandboxDevShm(id string) string {
        volatileRoot := c.getVolatileSandboxRootDir(id)
        return filepath.Join(volatileRoot, "shm")
}

// makeSandboxName derives a sandbox name from the sandbox metadata. The name
// is "<name>_<namespace>_<uid>_<attempt>" (joined with nameDelimiter), so it
// is unique as long as the sandbox metadata is unique.
func makeSandboxName(s *runtime.PodSandboxMetadata) string {
        attempt := strconv.FormatUint(uint64(s.Attempt), 10)
        // Field order: name, namespace, uid, attempt.
        fields := []string{s.Name, s.Namespace, s.Uid, attempt}
        return strings.Join(fields, nameDelimiter)
}

// makeContainerName derives a container name from the container and sandbox
// metadata. The name is
// "<container name>_<pod name>_<pod namespace>_<pod uid>_<container attempt>"
// (joined with nameDelimiter), so it is unique as long as the sandbox and
// container combination is unique.
func makeContainerName(c *runtime.ContainerMetadata, s *runtime.PodSandboxMetadata) string {
        // The attempt counter is the container's creation attempt, not the sandbox's.
        attempt := strconv.FormatUint(uint64(c.Attempt), 10)
        fields := []string{c.Name, s.Name, s.Namespace, s.Uid, attempt}
        return strings.Join(fields, nameDelimiter)
}

// getContainerRootDir builds the path of the persistent per-container
// directory (used for files such as the state checkpoint) by joining the
// configured RootDir, the containers subdirectory and the container id.
func (c *criService) getContainerRootDir(id string) string {
        base := c.config.RootDir
        return filepath.Join(base, containersDir, id)
}

// getVolatileContainerRootDir builds the path of the volatile per-container
// directory (used for files such as named pipes) by joining the configured
// StateDir, the containers subdirectory and the container id.
func (c *criService) getVolatileContainerRootDir(id string) string {
        base := c.config.StateDir
        return filepath.Join(base, containersDir, id)
}

// criContainerStateToString looks up the name of a CRI container state in the
// generated ContainerState_name table. A state that is not present in the
// table yields the empty string.
func criContainerStateToString(state runtime.ContainerState) string {
        key := int32(state)
        return runtime.ContainerState_name[key]
}

// toContainerdImage resolves an image-store entry to a containerd image
// handle by looking up the entry's first reference through the client.
// An entry without any reference is rejected with an error.
func (c *criService) toContainerdImage(ctx context.Context, image imagestore.Image) (containerd.Image, error) {
        refs := image.References
        if len(refs) > 0 {
                // The first reference is the one used for the lookup.
                return c.client.GetImage(ctx, refs[0])
        }
        // An image is expected to always carry at least one reference.
        return nil, fmt.Errorf("invalid image with no reference %q", image.ID)
}

// getUserFromImage interprets the user string of an image config.
//
// Anything after the first ':' (a group part such as "user:group") is dropped.
// When the remaining text parses as a base-10 int64 it is returned as the uid
// with an empty name; otherwise it is returned as the user name with a nil
// uid. An empty input yields (nil, "").
func getUserFromImage(user string) (*int64, string) {
        if user == "" {
                // Nothing specified in the image.
                return nil, ""
        }
        // Keep only the user part of "user[:group]".
        name, _, _ := strings.Cut(user, ":")
        if uid, err := strconv.ParseInt(name, 10, 64); err == nil {
                // Numeric: treat it as a uid.
                return &uid, ""
        }
        // Not numeric: treat it as a user name.
        return nil, name
}

// validateTargetContainer verifies that targetContainerID can serve as the
// target of a container using PID NamespaceMode_TARGET.
//
// The target must exist in the container store, must belong to sandboxID and
// must currently be in the running state. On success the target container is
// returned for convenience; on failure a zero Container and a descriptive
// error are returned.
func (c *criService) validateTargetContainer(sandboxID, targetContainerID string) (containerstore.Container, error) {
        var none containerstore.Container

        target, err := c.containerStore.Get(targetContainerID)
        if err != nil {
                return none, fmt.Errorf("container %q does not exist: %w", targetContainerID, err)
        }

        if owner := target.Metadata.SandboxID; owner != sandboxID {
                return none, fmt.Errorf("container %q (sandbox %s) does not belong to sandbox %s", targetContainerID, owner, sandboxID)
        }

        status := target.Status.Get()
        state := status.State()
        if state != runtime.ContainerState_CONTAINER_RUNNING {
                return none, fmt.Errorf("container %q is not running - in state %s", targetContainerID, state)
        }

        return target, nil
}

// isInCRIMounts reports whether dst is the container path of any mount in
// the CRI mount list. Both sides are normalized with filepath.Clean before
// being compared.
func isInCRIMounts(dst string, mounts []*runtime.Mount) bool {
        wanted := filepath.Clean(dst)
        for idx := range mounts {
                if filepath.Clean(mounts[idx].ContainerPath) == wanted {
                        return true
                }
        }
        return false
}

// filterLabel builds a containerd label filter expression of the form
// labels."<k>"=="<v>". Key and value are both formatted with `%q` because the
// containerd filter needs the extra quoting to work properly.
func filterLabel(k, v string) string {
        selector := fmt.Sprintf("labels.%q", k)
        return fmt.Sprintf("%s==%q", selector, v)
}

// buildLabels assembles the label set to be passed to containerd.
//
// Image config labels are added first; any that fail validation are skipped
// with a warning. Labels from the CRI request (configLabels) are applied next
// and override image labels with the same key. Finally the container kind
// label is set to containerType.
func buildLabels(configLabels, imageConfigLabels map[string]string, containerType string) map[string]string {
        merged := make(map[string]string)

        for key, value := range imageConfigLabels {
                err := clabels.Validate(key, value)
                if err != nil {
                        // An invalid image label is not fatal: warn and leave it off the container.
                        log.L.WithError(err).Warnf("unable to add image label with key %s to the container", key)
                        continue
                }
                merged[key] = value
        }

        // CRI request labels win over image config labels.
        for key, value := range configLabels {
                merged[key] = value
        }

        merged[crilabels.ContainerKindLabel] = containerType
        return merged
}

// getRuntimeOptions decodes the runtime options stored in the container
// metadata. It returns (nil, nil) when no options are set or their value is
// nil, and otherwise the result of unmarshalling them with typeurl.
func getRuntimeOptions(c containers.Container) (interface{}, error) {
        raw := c.Runtime.Options
        if raw == nil {
                return nil, nil
        }
        if raw.GetValue() == nil {
                return nil, nil
        }

        decoded, err := typeurl.UnmarshalAny(raw)
        if err != nil {
                return nil, err
        }
        return decoded, nil
}

// unknownExitCode is the exit code reported when the exit reason is unknown.
const unknownExitCode = 255

// unknownExitReason is the exit reason reported when the real reason is unknown.
const unknownExitReason = "Unknown"

// unknownContainerStatus produces the default status used for a container
// whose real status is unknown: all timestamps are left at zero, the exit code
// and reason are the "unknown" placeholders, and the Unknown flag is set.
func unknownContainerStatus() containerstore.Status {
        // CreatedAt, StartedAt and FinishedAt keep their zero values.
        var st containerstore.Status
        st.ExitCode = unknownExitCode
        st.Reason = unknownExitReason
        st.Unknown = true
        return st
}

// getPassthroughAnnotations selects the pod annotations whose key matches at
// least one of the patterns permitted for the runtime. The result is always a
// non-nil map. A pattern that path.Match rejects as malformed matches nothing.
func getPassthroughAnnotations(podAnnotations map[string]string,
        runtimePodAnnotations []string) (passthroughAnnotations map[string]string) {
        passthroughAnnotations = make(map[string]string)

        for _, allowed := range runtimePodAnnotations {
                for key, value := range podAnnotations {
                        // path.Match is used on purpose instead of filepath.Match:
                        // filepath.Match treats `\\` as a path separator on
                        // windows, which is not wanted for annotation keys.
                        matched, _ := path.Match(allowed, key)
                        if matched {
                                passthroughAnnotations[key] = value
                        }
                }
        }
        return passthroughAnnotations
}

// copyResourcesToStatus copies the container resource constraints from the
// spec into the Resources field of the given status and returns the updated
// status. Resources is always replaced with a fresh value; the Linux and
// Windows parts are only filled in when the spec has the matching section.
// This will need updates when new fields are added to ContainerResources.
func copyResourcesToStatus(spec *runtimespec.Spec, status containerstore.Status) containerstore.Status {
        status.Resources = &runtime.ContainerResources{}

        if spec.Linux != nil {
                linuxOut := &runtime.LinuxContainerResources{}
                status.Resources.Linux = linuxOut

                if proc := spec.Process; proc != nil && proc.OOMScoreAdj != nil {
                        linuxOut.OomScoreAdj = int64(*proc.OOMScoreAdj)
                }

                res := spec.Linux.Resources
                if res == nil {
                        // NOTE(review): this return also skips the Windows section
                        // below when both sections are present - confirm intended.
                        return status
                }

                if cpu := res.CPU; cpu != nil {
                        if cpu.Period != nil {
                                linuxOut.CpuPeriod = int64(*cpu.Period)
                        }
                        if cpu.Quota != nil {
                                linuxOut.CpuQuota = *cpu.Quota
                        }
                        if cpu.Shares != nil {
                                linuxOut.CpuShares = int64(*cpu.Shares)
                        }
                        linuxOut.CpusetCpus = cpu.Cpus
                        linuxOut.CpusetMems = cpu.Mems
                }

                if mem := res.Memory; mem != nil {
                        if mem.Limit != nil {
                                linuxOut.MemoryLimitInBytes = *mem.Limit
                        }
                        if mem.Swap != nil {
                                linuxOut.MemorySwapLimitInBytes = *mem.Swap
                        }
                }

                if res.HugepageLimits != nil {
                        limits := make([]*runtime.HugepageLimit, 0, len(res.HugepageLimits))
                        for idx := range res.HugepageLimits {
                                limits = append(limits, &runtime.HugepageLimit{
                                        PageSize: res.HugepageLimits[idx].Pagesize,
                                        Limit:    res.HugepageLimits[idx].Limit,
                                })
                        }
                        linuxOut.HugepageLimits = limits
                }

                if res.Unified != nil {
                        // The map is shared with the spec, not copied.
                        linuxOut.Unified = res.Unified
                }
        }

        if spec.Windows != nil {
                windowsOut := &runtime.WindowsContainerResources{}
                status.Resources.Windows = windowsOut

                res := spec.Windows.Resources
                if res == nil {
                        return status
                }

                if cpu := res.CPU; cpu != nil {
                        if cpu.Shares != nil {
                                windowsOut.CpuShares = int64(*cpu.Shares)
                        }
                        if cpu.Count != nil {
                                windowsOut.CpuCount = int64(*cpu.Count)
                        }
                        if cpu.Maximum != nil {
                                windowsOut.CpuMaximum = int64(*cpu.Maximum)
                        }
                }

                if mem := res.Memory; mem != nil && mem.Limit != nil {
                        windowsOut.MemoryLimitInBytes = int64(*mem.Limit)
                }

                // TODO: Figure out how to get RootfsSizeInBytes
        }
        return status
}

// generateAndSendContainerEvent builds a container event of the given type
// for containerID and sends it on the container events queue.
//
// The event carries the current sandbox status and the statuses of the
// sandbox's containers. Failure to obtain either is only logged: the event is
// still sent, with a nil sandbox status and/or whatever container statuses
// were returned.
func (c *criService) generateAndSendContainerEvent(ctx context.Context, containerID string, sandboxID string, eventType runtime.ContainerEventType) {
        sandboxStatus, err := c.getPodSandboxStatus(ctx, sandboxID)
        if err != nil {
                log.G(ctx).Warnf("Failed to get podSandbox status for container event for sandboxID %q: %v. Sending the event with nil podSandboxStatus.", sandboxID, err)
                sandboxStatus = nil
        }

        statuses, err := c.getContainerStatuses(ctx, sandboxID)
        if err != nil {
                log.G(ctx).Errorf("Failed to get container statuses for container event for sandboxID %q: %v", sandboxID, err)
        }

        c.containerEventsQ.Send(runtime.ContainerEventResponse{
                ContainerId:        containerID,
                ContainerEventType: eventType,
                CreatedAt:          time.Now().UnixNano(),
                PodSandboxStatus:   sandboxStatus,
                ContainersStatuses: statuses,
        })
}

// getPodSandboxStatus issues a PodSandboxStatus request for podSandboxID and
// returns the status from the response, or the request's error.
func (c *criService) getPodSandboxStatus(ctx context.Context, podSandboxID string) (*runtime.PodSandboxStatus, error) {
        resp, err := c.PodSandboxStatus(ctx, &runtime.PodSandboxStatusRequest{
                PodSandboxId: podSandboxID,
        })
        if err != nil {
                return nil, err
        }
        return resp.GetStatus(), nil
}

// getContainerStatuses lists the containers of the given sandbox and collects
// the (non-verbose) status of each one.
//
// A container whose status lookup fails with a not-found error is skipped;
// any other error aborts the call. On success the returned slice is non-nil,
// even when it is empty.
func (c *criService) getContainerStatuses(ctx context.Context, podSandboxID string) ([]*runtime.ContainerStatus, error) {
        filter := &runtime.ContainerFilter{PodSandboxId: podSandboxID}
        listed, err := c.ListContainers(ctx, &runtime.ListContainersRequest{Filter: filter})
        if err != nil {
                return nil, err
        }

        statuses := []*runtime.ContainerStatus{}
        for _, ctr := range listed.Containers {
                req := &runtime.ContainerStatusRequest{
                        ContainerId: ctr.Id,
                        Verbose:     false,
                }
                resp, err := c.ContainerStatus(ctx, req)
                switch {
                case err == nil:
                        statuses = append(statuses, resp.GetStatus())
                case errdefs.IsNotFound(err):
                        // The container is gone between listing and lookup; leave it out.
                        continue
                default:
                        return nil, err
                }
        }
        return statuses, nil
}

// hostNetwork reports whether host networking was requested for the sandbox.
// The rule depends on the OS the process runs on.
func hostNetwork(config *runtime.PodSandboxConfig) bool {
        switch goruntime.GOOS {
        case "windows":
                // Windows HostProcess pods can only run on the host network.
                return config.GetWindows().GetSecurityContext().GetHostProcess()
        case "darwin":
                // No CNI on Darwin yet.
                return true
        }

        // Linux, and every other platform without a special case above: host
        // networking means the network NamespaceMode is NODE.
        netMode := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetNetwork()
        return netMode == runtime.NamespaceMode_NODE
}

// getCgroupsPath generates container cgroups path.
func getCgroupsPath(cgroupsParent, id string) string {
        base := path.Base(cgroupsParent)
        if strings.HasSuffix(base, ".slice") {
                // For a.slice/b.slice/c.slice, base is c.slice.
                // runc systemd cgroup path format is "slice:prefix:name".
                return strings.Join([]string{base, "cri-containerd", id}, ":")
        }
        return filepath.Join(cgroupsParent, id)
}

// toLabel converts CRI SELinux options into a list of "key:value" label
// strings, in the order user, role, type, level. Empty fields are left out.
//
// Nil options yield (nil, nil). A level that fails checkSelinuxLevel yields
// that error. When every field is empty the returned slice is nil.
func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
        if selinuxOptions == nil {
                return nil, nil
        }
        if err := checkSelinuxLevel(selinuxOptions.Level); err != nil {
                return nil, err
        }

        parts := [...]struct{ prefix, value string }{
                {"user:", selinuxOptions.User},
                {"role:", selinuxOptions.Role},
                {"type:", selinuxOptions.Type},
                {"level:", selinuxOptions.Level},
        }

        var labels []string
        for _, part := range parts {
                if part.value != "" {
                        labels = append(labels, part.prefix+part.value)
                }
        }
        return labels, nil
}

// selinuxLevelPattern is the accepted shape of an SELinux level: a
// sensitivity "sN" (single digit), an optional "-sN" upper bound, and an
// optional ":"-introduced, comma-separated list of categories "cN" or
// category ranges "cN.cM" (one to four digits each).
// It is compiled once at package scope instead of on every call.
var selinuxLevelPattern = regexp.MustCompile(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`)

// checkSelinuxLevel validates the format of an SELinux level string.
//
// An empty level is accepted (nothing to check). Any other value must match
// selinuxLevelPattern; otherwise an error naming the offending level is
// returned.
func checkSelinuxLevel(level string) error {
        if len(level) == 0 {
                return nil
        }
        if !selinuxLevelPattern.MatchString(level) {
                return fmt.Errorf("the format of 'level' %q is not correct", level)
        }
        return nil
}

// parseUsernsIDMap converts a CRI ID mapping list into runtime-spec ID
// mappings.
//
// At most one mapping line is accepted. An empty list, or a single nil entry,
// yields a nil result with no error. More than one entry, or an entry whose
// length is below 1, is an error.
func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]runtimespec.LinuxIDMapping, error) {
        switch count := len(runtimeIDMap); {
        case count == 0:
                return nil, nil
        case count > 1:
                // We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that.
                return nil, fmt.Errorf("only one mapping line supported, got %v mapping lines", count)
        }

        // Exactly one entry from here on.
        entry := runtimeIDMap[0]
        if entry == nil {
                return nil, nil
        }
        if entry.Length < 1 {
                return nil, fmt.Errorf("invalid mapping length: %v", entry.Length)
        }

        mapping := runtimespec.LinuxIDMapping{
                ContainerID: entry.ContainerId,
                HostID:      entry.HostId,
                Size:        entry.Length,
        }
        return []runtimespec.LinuxIDMapping{mapping}, nil
}

// parseUsernsIDs extracts and validates the UID and GID mappings of a CRI
// user namespace configuration.
//
// A nil configuration is valid and yields all-nil results. Otherwise both
// mapping lists are parsed and checked against the mode: NODE must come with
// no mappings, POD must come with both UID and GID mappings, and any other
// mode is rejected.
func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []runtimespec.LinuxIDMapping, retErr error) {
        if userns == nil {
                // If userns is not set, the kubelet doesn't support this option
                // and we should just fallback to no userns. This is completely
                // valid.
                return nil, nil, nil
        }

        uidMaps, err := parseUsernsIDMap(userns.GetUids())
        if err != nil {
                return nil, nil, fmt.Errorf("UID mapping: %w", err)
        }
        gidMaps, err := parseUsernsIDMap(userns.GetGids())
        if err != nil {
                return nil, nil, fmt.Errorf("GID mapping: %w", err)
        }

        hasUIDs := len(uidMaps) != 0
        hasGIDs := len(gidMaps) != 0

        mode := userns.GetMode()
        switch mode {
        case runtime.NamespaceMode_NODE:
                if hasUIDs || hasGIDs {
                        return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uidMaps), len(gidMaps))
                }
        case runtime.NamespaceMode_POD:
                // This is valid, we will handle it in WithPodNamespaces().
                if !hasUIDs || !hasGIDs {
                        return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode)
                }
        default:
                return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
        }

        return uidMaps, gidMaps, nil
}

// sameUsernsConfig reports whether two user namespace configs are the same.
// Two nil configs are equal; a nil and a non-nil config are not. Non-nil
// configs are equal when their modes match, both parse without error, and
// their UID and GID mappings are equal element by element (so the same
// mappings in a different order compare as different).
// XXX: If the runtime.UserNamespace struct changes, we should update this
// function accordingly.
func sameUsernsConfig(a, b *runtime.UserNamespace) bool {
        if a == nil || b == nil {
                // Equal only when both are missing.
                return a == nil && b == nil
        }

        if a.GetMode() != b.GetMode() {
                return false
        }

        aUids, aGids, aErr := parseUsernsIDs(a)
        if aErr != nil {
                return false
        }
        bUids, bGids, bErr := parseUsernsIDs(b)
        if bErr != nil {
                return false
        }

        return sameMapping(aUids, bUids) && sameMapping(aGids, bGids)
}

// sameMapping reports whether two mapping lists have the same length and the
// same ContainerID, HostID and Size at every position. The comparison is
// positional, so equal mappings in a different order compare as different.
func sameMapping(a, b []runtimespec.LinuxIDMapping) bool {
        if len(a) != len(b) {
                return false
        }

        for idx := range a {
                left, right := a[idx], b[idx]
                if left.ContainerID != right.ContainerID ||
                        left.HostID != right.HostID ||
                        left.Size != right.Size {
                        return false
                }
        }
        return true
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "os"
        "path/filepath"
        "sort"
        "strings"
        "syscall"
        "time"

        "github.com/containerd/cgroups/v3"
        "github.com/containerd/log"
        "github.com/moby/sys/mountinfo"
        "github.com/opencontainers/runtime-spec/specs-go"
        "golang.org/x/sys/unix"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/internal/cri/seutil"
        "github.com/containerd/containerd/v2/pkg/apparmor"
        "github.com/containerd/containerd/v2/pkg/seccomp"
)

// apparmorEnabled reports whether apparmor should be used: it must not be
// disabled in the CRI config, and apparmor.HostSupports must report that the
// host supports it. The host is only probed when apparmor is not disabled.
func (c *criService) apparmorEnabled() bool {
        return !c.config.DisableApparmor && apparmor.HostSupports()
}

// seccompEnabled reports whether seccomp is enabled, as determined by
// seccomp.IsEnabled.
func (c *criService) seccompEnabled() bool {
        enabled := seccomp.IsEnabled()
        return enabled
}

// openLogFile opens a container log file for appending, creating the file
// (mode 0640) and any missing parent directories (mode 0755) first.
func openLogFile(path string) (*os.File, error) {
        parent := filepath.Dir(path)
        if err := os.MkdirAll(parent, 0755); err != nil {
                return nil, err
        }

        const flags = os.O_CREATE | os.O_APPEND | os.O_WRONLY
        return os.OpenFile(path, flags, 0640)
}

// unmountRecursive unmounts target and every mount below it, working from
// the longest mountpoint path to the shortest.
//
// Failures on all but the last entry are only logged at debug level, on the
// assumption that the final unmount will fail if they were a real problem; a
// failure on the last entry is returned.
func unmountRecursive(ctx context.Context, target string) error {
        canonical, err := mount.CanonicalizePath(target)
        if err != nil {
                return err
        }

        mounts, err := mountinfo.GetMounts(mountinfo.PrefixFilter(canonical))
        if err != nil {
                return err
        }

        // Longest mountpoint first, so the deepest mounts are handled first.
        sort.Slice(mounts, func(a, b int) bool {
                return len(mounts[b].Mountpoint) < len(mounts[a].Mountpoint)
        })

        last := len(mounts) - 1
        for idx, entry := range mounts {
                err := mount.UnmountAll(entry.Mountpoint, unix.MNT_DETACH)
                if err == nil {
                        continue
                }
                if idx == last {
                        return err
                }
                // This is some submount, we can ignore this error for now, the final unmount will fail if this is a real problem
                log.G(ctx).WithError(err).Debugf("failed to unmount submount %s", entry.Mountpoint)
        }
        return nil
}

// ensureRemoveAll wraps `os.RemoveAll` to check for specific errors that can
// often be remedied.
// Only use `ensureRemoveAll` if you really want to make every effort to remove
// a directory.
//
// Because of the way `os.Remove` (and by extension `os.RemoveAll`) works, there
// can be a race between reading directory entries and then actually attempting
// to remove everything in the directory.
// These types of errors do not need to be returned since it's ok for the dir to
// be gone we can just retry the remove operation.
//
// The function first makes a best-effort recursive unmount of dir, then calls
// `os.RemoveAll` in a loop:
//   - a not-exist error for dir itself counts as success;
//   - a not-exist error for another path is retried once per path;
//   - an EBUSY error triggers a detached unmount of the reported path and a
//     retry after 100ms, up to maxRetry retries per path;
//   - any other error is returned as is.
//
// This should not return a `os.ErrNotExist` kind of error under any circumstances
// NOTE(review): a not-exist error reported twice for the same path (other than
// dir) is returned to the caller, which appears to contradict the sentence
// above - confirm which one is intended.
func ensureRemoveAll(ctx context.Context, dir string) error {
        // Paths for which a not-exist error has already been seen once.
        notExistErr := make(map[string]bool)

        // track retries
        exitOnErr := make(map[string]int)
        maxRetry := 50

        // Attempt to unmount anything beneath this dir first.
        // A failure here is only logged; removal is attempted regardless.
        if err := unmountRecursive(ctx, dir); err != nil {
                log.G(ctx).WithError(err).Debugf("failed to do initial unmount of %s", dir)
        }

        for {
                err := os.RemoveAll(dir)
                if err == nil {
                        return nil
                }

                // Only *os.PathError values carry the path needed for recovery;
                // anything else is returned unchanged.
                pe, ok := err.(*os.PathError)
                if !ok {
                        return err
                }

                if os.IsNotExist(err) {
                        // Second not-exist error for the same path: give up.
                        if notExistErr[pe.Path] {
                                return err
                        }
                        notExistErr[pe.Path] = true

                        // There is a race where some subdir can be removed but after the
                        // parent dir entries have been read.
                        // So the path could be from `os.Remove(subdir)`
                        // If the reported non-existent path is not the passed in `dir` we
                        // should just retry, but otherwise return with no error.
                        if pe.Path == dir {
                                return nil
                        }
                        continue
                }

                // From here on only EBUSY is handled.
                if pe.Err != syscall.EBUSY {
                        return err
                }
                // Detach whatever is mounted on the busy path before retrying.
                if e := mount.Unmount(pe.Path, unix.MNT_DETACH); e != nil {
                        return fmt.Errorf("error while removing %s: %w", dir, e)
                }

                // Stop once this path has already been retried maxRetry times.
                if exitOnErr[pe.Path] == maxRetry {
                        return err
                }
                exitOnErr[pe.Path]++
                time.Sleep(100 * time.Millisecond)
        }
}

// vmbasedRuntimes lists the substrings that identify a runtime type as being
// VM based.
var vmbasedRuntimes = []string{"io.containerd.kata"}

// isVMBasedRuntime reports whether runtimeType contains any of the entries in
// vmbasedRuntimes.
func isVMBasedRuntime(runtimeType string) bool {
        for idx := 0; idx < len(vmbasedRuntimes); idx++ {
                if strings.Contains(runtimeType, vmbasedRuntimes[idx]) {
                        return true
                }
        }
        return false
}

// modifyProcessLabel rewrites the process SELinux label of the spec to its
// KVM variant (via seutil.ChangeToKVM) when runtimeType is a VM based
// runtime. For any other runtime type the spec is left untouched.
func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
        if isVMBasedRuntime(runtimeType) {
                kvmLabel, err := seutil.ChangeToKVM(spec.Process.SelinuxLabel)
                if err != nil {
                        return fmt.Errorf("failed to get selinux kvm label: %w", err)
                }
                spec.Process.SelinuxLabel = kvmLabel
        }
        return nil
}

// isUnifiedCgroupsMode reports whether the cgroups package detects the
// unified cgroup mode.
// TODO: add build constraints to cgroups package and remove this helper
func isUnifiedCgroupsMode() bool {
        mode := cgroups.Mode()
        return mode == cgroups.Unified
}

// snapshotterRemapOpts builds the snapshot options needed for the user
// namespace settings in nsOpts.
//
// Without user namespace options an empty, non-nil option list is returned.
// With them, the ID mappings are parsed and validated; in POD mode a remapper
// label option is added, built from the host IDs of the first UID and GID
// mappings and the size of the first UID mapping.
func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) {
        opts := []snapshots.Opt{}

        userns := nsOpts.GetUsernsOptions()
        if userns == nil {
                return opts, nil
        }

        uidMaps, gidMaps, err := parseUsernsIDs(userns)
        if err != nil {
                return nil, fmt.Errorf("user namespace configuration: %w", err)
        }

        if userns.GetMode() != runtime.NamespaceMode_POD {
                return opts, nil
        }

        uid, gid := uidMaps[0], gidMaps[0]
        remap := containerd.WithRemapperLabels(0, uid.HostID, 0, gid.HostID, uid.Size)
        return append(opts, remap), nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "fmt"
        "sync"

        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
)

// CheckImages checks all existing images to ensure they are ready to be used
// for CRI. Every image is checked in its own goroutine and the call waits for
// all of them to finish.
//
// Only a failure to list the images is returned. Problems with an individual
// image are logged and that image is skipped; an image that is merely not
// unpacked is logged and still has its reference updated.
func (c *CRIImageService) CheckImages(ctx context.Context) error {
        // TODO: Move away from `client.ListImages` to directly using image store
        cImages, err := c.client.ListImages(ctx)
        if err != nil {
                return fmt.Errorf("unable to list images: %w", err)
        }

        // TODO: Support all snapshotter
        snapshotter := c.config.Snapshotter

        var wg sync.WaitGroup
        wg.Add(len(cImages))
        for idx := range cImages {
                img := cImages[idx]
                go func() {
                        defer wg.Done()

                        // TODO: Check platform/snapshot combination. Snapshot check should come first
                        ready, _, _, _, err := images.Check(ctx, img.ContentStore(), img.Target(), platforms.Default())
                        switch {
                        case err != nil:
                                log.G(ctx).WithError(err).Errorf("Failed to check image content readiness for %q", img.Name())
                                return
                        case !ready:
                                log.G(ctx).Warnf("The image content readiness for %q is not ok", img.Name())
                                return
                        }

                        // Checking existence of top-level snapshot for each image being recovered.
                        // TODO: This logic should be done elsewhere and owned by the image service
                        unpacked, err := img.IsUnpacked(ctx, snapshotter)
                        if err != nil {
                                log.G(ctx).WithError(err).Warnf("Failed to check whether image is unpacked for image %s", img.Name())
                                return
                        }
                        if !unpacked {
                                // Not fatal: the reference is still updated below.
                                log.G(ctx).Warnf("The image %s is not unpacked.", img.Name())
                                // TODO(random-liu): Consider whether we should try unpack here.
                        }

                        if err := c.UpdateImage(ctx, img.Name()); err != nil {
                                log.G(ctx).WithError(err).Warnf("Failed to update reference for image %q", img.Name())
                                return
                        }
                        log.G(ctx).Debugf("Loaded image %q", img.Name())
                }()
        }
        wg.Wait()
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// ListImages returns every image held in the in-memory image store, converted
// to its CRI representation. The request is not inspected, so no filtering is
// applied.
// TODO(random-liu): Add image list filters after CRI defines this more clear, and kubelet
// actually needs it.
func (c *GRPCCRIImageService) ListImages(ctx context.Context, r *runtime.ListImagesRequest) (*runtime.ListImagesResponse, error) {
        // TODO: From CRIImageService directly
        var criImages []*runtime.Image
        for _, stored := range c.imageStore.List() {
                // TODO(random-liu): [P0] Make sure corresponding snapshot exists. What if snapshot
                // doesn't exist?
                criImages = append(criImages, toCRIImage(stored))
        }

        resp := &runtime.ListImagesResponse{Images: criImages}
        return resp, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "crypto/tls"
        "encoding/base64"
        "fmt"
        "io"
        "net"
        "net/http"
        "net/url"
        "path/filepath"
        "strconv"
        "strings"
        "sync"
        "sync/atomic"
        "time"

        "github.com/containerd/log"
        distribution "github.com/distribution/reference"
        imagedigest "github.com/opencontainers/go-digest"
        imagespec "github.com/opencontainers/image-spec/specs-go/v1"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/diff"
        containerdimages "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/remotes/docker"
        "github.com/containerd/containerd/v2/core/remotes/docker/config"
        "github.com/containerd/containerd/v2/internal/cri/annotations"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        crilabels "github.com/containerd/containerd/v2/internal/cri/labels"
        snpkg "github.com/containerd/containerd/v2/pkg/snapshotters"
        "github.com/containerd/containerd/v2/pkg/tracing"
        "github.com/containerd/errdefs"
)

// For image management:
// 1) We have an in-memory metadata index to:
//   a. Maintain ImageID -> RepoTags, ImageID -> RepoDigest relationships; ImageID
//   is the digest of image config, which conforms to the OCI image spec.
//   b. Cache constant and useful information such as image chainID, config etc.
//   c. An image will be added into the in-memory metadata only when it's successfully
//   pulled and unpacked.
//
// 2) We use containerd image metadata store and content store:
//   a. To resolve image reference (digest/tag) locally. During pulling image, we
//   normalize the image reference provided by user, and put it into image metadata
//   store with resolved descriptor. For the other operations, if image id is provided,
//   we'll access the in-memory metadata index directly; if image reference is
//   provided, we'll normalize it, resolve it in containerd image metadata store
//   to get the image id.
//   b. As the backup of in-memory metadata in 1). During startup, the in-memory
//   metadata could be re-constructed from image metadata store + content store.
//
// Several problems with current approach:
// 1) An entry in containerd image metadata store doesn't mean a "READY" (successfully
// pulled and unpacked) image. E.g. during pulling, the client gets killed. In that case,
// if we see an image without snapshots or with incomplete contents during startup,
// should we re-pull the image? Or should we remove the entry?
//
// yanxuean: We can't delete the image directly, because we don't know whether the
// image was pulled by us. This can leak resources.
//
// 2) Containerd suggests user to add entry before pulling the image. However if
// an error occurs during the pulling, should we remove the entry from metadata
// store? Or should we leave it there until next startup (resource leakage)?
//
// 3) The cri plugin only exposes "READY" (successfully pulled and unpacked) images
// to the user, which are maintained in the in-memory metadata index. However, it's
// still possible that someone else removes the content or snapshot, bypassing the cri
// plugin. How do we detect that and update the in-memory metadata correspondingly? Always
// check whether the corresponding snapshot is ready when reporting image status?
//
// 4) Is the content important if we cached necessary information in-memory
// after we pull the image? How to manage the disk usage of contents? If some
// contents are missing but snapshots are ready, is the image still "READY"?

// PullImage pulls an image with authentication config.
//
// Credentials are taken from the request's auth config when it is set;
// otherwise the auth configured for the host under the registry configs is
// used, if any. The actual pull is delegated to CRIImageService.PullImage and
// the reference it returns is placed in the response.
func (c *GRPCCRIImageService) PullImage(ctx context.Context, r *runtime.PullImageRequest) (_ *runtime.PullImageResponse, err error) {
        // resolveCreds is invoked per registry host while pulling.
        resolveCreds := func(host string) (string, string, error) {
                authCfg := r.GetAuth()
                if authCfg == nil {
                        // Fall back to the statically configured auth for this host.
                        if hostCfg := c.config.Registry.Configs[host]; hostCfg.Auth != nil {
                                authCfg = toRuntimeAuthConfig(*hostCfg.Auth)
                        }
                }
                return ParseAuth(authCfg, host)
        }

        imageSpec := r.GetImage()
        ref, err := c.CRIImageService.PullImage(ctx, imageSpec.GetImage(), resolveCreds, r.SandboxConfig, imageSpec.GetRuntimeHandler())
        if err != nil {
                return nil, err
        }
        return &runtime.PullImageResponse{ImageRef: ref}, nil
}

// PullImage pulls and unpacks the image referenced by name and returns its
// image ID, which is the digest of the image config.
//
// name is normalized with distribution.ParseDockerRef before it is pulled.
// credentials is passed to the registry host configuration used by the
// resolver. sandboxConfig is passed to snapshotterFromPodSandboxConfig to
// select the snapshotter. After a successful pull, image references for the
// image ID, repo tag and repo digest (whichever are non-empty) are created or
// updated in containerd and mirrored into the in-memory image store.
//
// The imagePulls counter is incremented with "success" or "failure" on every
// return, and inProgressImagePulls is held incremented for the duration of
// the call.
//
// NOTE(review): runtimeHandler is not referenced in this function body.
func (c *CRIImageService) PullImage(ctx context.Context, name string, credentials func(string) (string, string, error), sandboxConfig *runtime.PodSandboxConfig, runtimeHandler string) (_ string, err error) {
        span := tracing.SpanFromContext(ctx)
        // The named result err is inspected here so that every return path is counted.
        defer func() {
                // TODO: add domain label for imagePulls metrics, and we may need to provide a mechanism
                // for the user to configure the set of registries that they are interested in.
                if err != nil {
                        imagePulls.WithValues("failure").Inc()
                } else {
                        imagePulls.WithValues("success").Inc()
                }
        }()

        inProgressImagePulls.Inc()
        defer inProgressImagePulls.Dec()
        startTime := time.Now()

        namedRef, err := distribution.ParseDockerRef(name)
        if err != nil {
                return "", fmt.Errorf("failed to parse image reference %q: %w", name, err)
        }
        ref := namedRef.String()
        if ref != name {
                log.G(ctx).Debugf("PullImage using normalized image ref: %q", ref)
        }

        imagePullProgressTimeout, err := time.ParseDuration(c.config.ImagePullProgressTimeout)
        if err != nil {
                return "", fmt.Errorf("failed to parse image_pull_progress_timeout %q: %w", c.config.ImagePullProgressTimeout, err)
        }

        var (
                // pctx scopes the pull itself; pcancel is handed to the progress
                // reporter so that it can abort the pull.
                pctx, pcancel = context.WithCancel(ctx)

                pullReporter = newPullProgressReporter(ref, pcancel, imagePullProgressTimeout)

                resolver = docker.NewResolver(docker.ResolverOptions{
                        Headers: c.config.Registry.Headers,
                        Hosts:   c.registryHosts(ctx, credentials, pullReporter.optionUpdateClient),
                })
                // isSchema1 is set by imageHandler when a Docker schema1 manifest is
                // seen; it is consumed by getRepoDigestAndTag below.
                isSchema1    bool
                imageHandler containerdimages.HandlerFunc = func(_ context.Context,
                        desc imagespec.Descriptor) ([]imagespec.Descriptor, error) {
                        if desc.MediaType == containerdimages.MediaTypeDockerSchema1Manifest {
                                isSchema1 = true
                        }
                        return nil, nil
                }
        )

        // Ensures pctx is released on the early-return paths before the pull starts.
        defer pcancel()
        snapshotter, err := c.snapshotterFromPodSandboxConfig(ctx, ref, sandboxConfig)
        if err != nil {
                return "", err
        }
        log.G(ctx).Debugf("PullImage %q with snapshotter %s", ref, snapshotter)
        span.SetAttributes(
                tracing.Attribute("image.ref", ref),
                tracing.Attribute("snapshotter.name", snapshotter),
        )

        labels := c.getLabels(ctx, ref)

        pullOpts := []containerd.RemoteOpt{
                containerd.WithSchema1Conversion, //nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility.
                containerd.WithResolver(resolver),
                containerd.WithPullSnapshotter(snapshotter),
                containerd.WithPullUnpack,
                containerd.WithPullLabels(labels),
                containerd.WithMaxConcurrentDownloads(c.config.MaxConcurrentDownloads),
                containerd.WithImageHandler(imageHandler),
                containerd.WithUnpackOpts([]containerd.UnpackOpt{
                        containerd.WithUnpackDuplicationSuppressor(c.unpackDuplicationSuppressor),
                        containerd.WithUnpackApplyOpts(diff.WithSyncFs(c.config.ImagePullWithSyncFs)),
                }),
        }

        // Temporarily removed for v2 upgrade
        //pullOpts = append(pullOpts, c.encryptedImagesPullOpts()...)
        if !c.config.DisableSnapshotAnnotations {
                pullOpts = append(pullOpts,
                        containerd.WithImageHandlerWrapper(snpkg.AppendInfoHandlerWrapper(ref)))
        }

        if c.config.DiscardUnpackedLayers {
                // Allows GC to clean layers up from the content store after unpacking
                pullOpts = append(pullOpts,
                        containerd.WithChildLabelMap(containerdimages.ChildGCLabelsFilterLayers))
        }

        pullReporter.start(pctx)
        image, err := c.client.Pull(pctx, ref, pullOpts...)
        // Cancel pctx as soon as the pull returns so the reporter goroutine,
        // which watches pctx, stops regardless of the outcome.
        pcancel()
        if err != nil {
                return "", fmt.Errorf("failed to pull and unpack image %q: %w", ref, err)
        }
        span.AddEvent("Pull and unpack image complete")

        // From here on the original ctx is used, since pctx is already cancelled.
        configDesc, err := image.Config(ctx)
        if err != nil {
                return "", fmt.Errorf("get image config descriptor: %w", err)
        }
        imageID := configDesc.Digest.String()

        repoDigest, repoTag := getRepoDigestAndTag(namedRef, image.Target().Digest, isSchema1)
        // Register every non-empty name under which the image should be reachable.
        for _, r := range []string{imageID, repoTag, repoDigest} {
                if r == "" {
                        continue
                }
                if err := c.createOrUpdateImageReference(ctx, r, image.Target(), labels); err != nil {
                        return "", fmt.Errorf("failed to create image reference %q: %w", r, err)
                }
                // Update image store to reflect the newest state in containerd.
                // No need to use `updateImage`, because the image reference must
                // have been managed by the cri plugin.
                // TODO: Use image service directly
                if err := c.imageStore.Update(ctx, r); err != nil {
                        return "", fmt.Errorf("failed to update image store %q: %w", r, err)
                }
        }

        const mbToByte = 1024 * 1024
        // NOTE(review): the error from image.Size is discarded, so whatever size
        // is returned alongside a failure is still fed into the throughput metric
        // and the log line below.
        size, _ := image.Size(ctx)
        // Throughput in MiB per second, measured from the start of this call.
        imagePullingSpeed := float64(size) / mbToByte / time.Since(startTime).Seconds()
        imagePullThroughput.Observe(imagePullingSpeed)

        log.G(ctx).Infof("Pulled image %q with image id %q, repo tag %q, repo digest %q, size %q in %s", name, imageID,
                repoTag, repoDigest, strconv.FormatInt(size, 10), time.Since(startTime))
        // NOTE(random-liu): the actual state in containerd is the source of truth, even we maintain
        // in-memory image store, it's only for in-memory indexing. The image could be removed
        // by someone else anytime, before/during/after we create the metadata. We should always
        // check the actual state in containerd before using the image or returning status of the
        // image.
        return imageID, nil
}

// getRepoDigestAndTag returns image repoDigest and repoTag of the named image reference.
//
// The repo tag is set only when namedRef carries a tag. The repo digest is
// namedRef itself when it is canonical; otherwise it is built from the
// repository name and digest, unless the image is schema1.
func getRepoDigestAndTag(namedRef distribution.Named, digest imagedigest.Digest, schema1 bool) (string, string) {
        var tag, dgst string

        if _, tagged := namedRef.(distribution.NamedTagged); tagged {
                tag = namedRef.String()
        }

        _, canonical := namedRef.(distribution.Canonical)
        switch {
        case canonical:
                dgst = namedRef.String()
        case !schema1:
                // digest is not actual repo digest for schema1 image.
                dgst = namedRef.Name() + "@" + digest.String()
        }
        return dgst, tag
}

// ParseAuth parses AuthConfig and returns username and password/secret required by containerd.
//
// A nil auth, an auth whose ServerAddress host differs from host, and an auth
// with no usable field all yield empty credentials and a nil error. Otherwise
// the first populated field wins, in this order:
//   - Username: returned together with Password.
//   - IdentityToken: returned as the secret with an empty username.
//   - Auth: base64 (standard encoding) of "user:password".
//
// An error is returned when ServerAddress cannot be parsed, when Auth is not
// valid base64, or when the decoded Auth contains no ":" separator.
func ParseAuth(auth *runtime.AuthConfig, host string) (string, string, error) {
        if auth == nil {
                return "", "", nil
        }
        if auth.ServerAddress != "" {
                // Do not return the auth info when server address doesn't match.
                u, err := url.Parse(auth.ServerAddress)
                if err != nil {
                        return "", "", fmt.Errorf("parse server address: %w", err)
                }
                if host != u.Host {
                        return "", "", nil
                }
        }
        if auth.Username != "" {
                return auth.Username, auth.Password, nil
        }
        if auth.IdentityToken != "" {
                return "", auth.IdentityToken, nil
        }
        if auth.Auth != "" {
                // DecodeString returns exactly the decoded bytes, so neither the
                // credentials nor the error message below pick up padding bytes
                // from an over-sized buffer.
                decoded, err := base64.StdEncoding.DecodeString(auth.Auth)
                if err != nil {
                        return "", "", fmt.Errorf("decode auth: %w", err)
                }
                user, passwd, ok := strings.Cut(string(decoded), ":")
                if !ok {
                        return "", "", fmt.Errorf("invalid decoded auth: %q", decoded)
                }
                // NUL trimming is kept so that every auth string that decoded
                // successfully before still yields the same password.
                return user, strings.Trim(passwd, "\x00"), nil
        }
        // TODO(random-liu): Support RegistryToken.
        // An empty auth config is valid for anonymous registry
        return "", "", nil
}

// createOrUpdateImageReference creates the image reference name in the containerd
// image store, or updates it when it already exists.
//
// On update, the target is always included; the managed label and the pinned
// label are included only when they differ from the stored image (and, for the
// pinned label, only when the new labels actually request pinning). If the
// target digest is unchanged and no label needs updating, nothing is written.
//
// Note that because create and update are not finished in one transaction, there could be race. E.g.
// the image reference is deleted by someone else after create returns already exists, but before update
// happens.
func (c *CRIImageService) createOrUpdateImageReference(ctx context.Context, name string, desc imagespec.Descriptor, labels map[string]string) error {
        img := containerdimages.Image{
                Name:   name,
                Target: desc,
                // Add a label to indicate that the image is managed by the cri plugin.
                Labels: labels,
        }
        // TODO(random-liu): Figure out which is the more performant sequence create then update or
        // update then create.
        // TODO: Call CRIImageService directly
        _, createErr := c.images.Create(ctx, img)
        switch {
        case createErr == nil:
                return nil
        case !errdefs.IsAlreadyExists(createErr):
                return createErr
        }

        // Create returns an empty image on ErrAlreadyExists, so the stored image
        // has to be fetched explicitly to compare against.
        existing, err := c.images.Get(ctx, name)
        if err != nil {
                return err
        }

        paths := []string{"target"}
        if existing.Labels[crilabels.ImageLabelKey] != labels[crilabels.ImageLabelKey] {
                paths = append(paths, "labels."+crilabels.ImageLabelKey)
        }
        wantPinned := labels[crilabels.PinnedImageLabelKey]
        if wantPinned == crilabels.PinnedImageLabelValue && existing.Labels[crilabels.PinnedImageLabelKey] != wantPinned {
                paths = append(paths, "labels."+crilabels.PinnedImageLabelKey)
        }

        // Only "target" is listed and the target did not change: nothing to do.
        if len(paths) == 1 && existing.Target.Digest == img.Target.Digest {
                return nil
        }
        _, err = c.images.Update(ctx, img, paths...)
        return err
}

// getLabels returns the labels to be set on a CRI image: always the managed
// label, plus the pinned label when name appears in the configured pinned images.
func (c *CRIImageService) getLabels(ctx context.Context, name string) map[string]string {
        managed := map[string]string{crilabels.ImageLabelKey: crilabels.ImageLabelValue}
        for i := range c.config.PinnedImages {
                if c.config.PinnedImages[i] != name {
                        continue
                }
                managed[crilabels.PinnedImageLabelKey] = crilabels.PinnedImageLabelValue
                // Further matches would only write the same value again.
                break
        }
        return managed
}

// UpdateImage updates image store to reflect the newest state of an image reference
// in containerd. If the reference is not managed by the cri plugin, the function also
// generates necessary metadata for the image and make it managed.
//
// When the reference is not found in containerd, the image store is still
// refreshed so that the image can be dropped from the cache.
func (c *CRIImageService) UpdateImage(ctx context.Context, r string) error {
        // TODO: Use image service
        img, getErr := c.client.GetImage(ctx, r)
        if getErr != nil && !errdefs.IsNotFound(getErr) {
                return fmt.Errorf("get image by reference: %w", getErr)
        }

        // getErr is either nil or "not found" here. A missing image skips
        // straight to the cache refresh at the bottom.
        if getErr == nil {
                current := img.Labels()
                wanted := c.getLabels(ctx, r)

                managed := true
                for key, value := range wanted {
                        if current[key] != value {
                                managed = false
                                break
                        }
                }

                if !managed {
                        // Make sure the image has the image id as its unique
                        // identifier that references the image in its lifetime.
                        configDesc, err := img.Config(ctx)
                        if err != nil {
                                return fmt.Errorf("get image id: %w", err)
                        }
                        id := configDesc.Digest.String()
                        if err := c.createOrUpdateImageReference(ctx, id, img.Target(), wanted); err != nil {
                                return fmt.Errorf("create image id reference %q: %w", id, err)
                        }
                        if err := c.imageStore.Update(ctx, id); err != nil {
                                return fmt.Errorf("update image store for %q: %w", id, err)
                        }
                        // The image id is ready, add the label to mark the image as managed.
                        if err := c.createOrUpdateImageReference(ctx, r, img.Target(), wanted); err != nil {
                                return fmt.Errorf("create managed label: %w", err)
                        }
                }
        }

        if err := c.imageStore.Update(ctx, r); err != nil {
                return fmt.Errorf("update image store for %q: %w", r, err)
        }
        return nil
}

// hostDirFromRoots returns a lookup function that tries each root in order,
// using config.HostDirFromRoot, and stops at the first root that yields a
// non-empty directory or an error other than "not found". If no root matches,
// the result of the last lookup is returned.
func hostDirFromRoots(roots []string) func(string) (string, error) {
        resolvers := make([]func(string) (string, error), 0, len(roots))
        for _, root := range roots {
                resolvers = append(resolvers, config.HostDirFromRoot(root))
        }

        return func(host string) (string, error) {
                var (
                        dir string
                        err error
                )
                for _, resolve := range resolvers {
                        dir, err = resolve(host)
                        if dir != "" {
                                return dir, err
                        }
                        if err != nil && !errdefs.IsNotFound(err) {
                                return dir, err
                        }
                }
                return dir, err
        }
}

// registryHosts is the registry hosts to be used by the resolver.
//
// When Registry.ConfigPath yields at least one path, host configuration is
// delegated to config.ConfigureHosts, with host directories looked up under
// those paths and credentials/updateClientFn passed through as host options.
//
// Otherwise the returned function builds one docker.RegistryHost per endpoint
// reported by registryEndpoints for the requested host. Each endpoint gets its
// own transport, HTTP client and authorizer; updateClientFn, when non-nil, is
// applied to the client before the authorizer is created. Every host is given
// the resolve and pull capabilities.
func (c *CRIImageService) registryHosts(ctx context.Context, credentials func(host string) (string, string, error), updateClientFn config.UpdateClientFunc) docker.RegistryHosts {
        paths := filepath.SplitList(c.config.Registry.ConfigPath)
        if len(paths) > 0 {
                hostOptions := config.HostOptions{
                        UpdateClient: updateClientFn,
                }
                hostOptions.Credentials = credentials
                hostOptions.HostDir = hostDirFromRoots(paths)

                return config.ConfigureHosts(ctx, hostOptions)
        }

        return func(host string) ([]docker.RegistryHost, error) {
                var registries []docker.RegistryHost

                endpoints, err := c.registryEndpoints(host)
                if err != nil {
                        return nil, fmt.Errorf("get registry endpoints: %w", err)
                }
                for _, e := range endpoints {
                        u, err := url.Parse(e)
                        if err != nil {
                                return nil, fmt.Errorf("parse registry endpoint %q from mirrors: %w", e, err)
                        }

                        var (
                                transport = newTransport()
                                client    = &http.Client{Transport: transport}
                                // Note: this local shadows the imported config package for
                                // the rest of the loop body. It is the per-endpoint registry
                                // config, keyed by the endpoint's host.
                                config    = c.config.Registry.Configs[u.Host]
                        )

                        if docker.IsLocalhost(host) && u.Scheme == "http" {
                                // Skipping TLS verification for localhost
                                transport.TLSClientConfig = &tls.Config{
                                        InsecureSkipVerify: true,
                                }
                        }

                        // Make a copy of `credentials`, so that different authorizers would not reference
                        // the same credentials variable.
                        credentials := credentials
                        // Only when the caller supplied no credentials function does the
                        // endpoint's configured auth take effect.
                        if credentials == nil && config.Auth != nil {
                                auth := toRuntimeAuthConfig(*config.Auth)
                                credentials = func(host string) (string, string, error) {
                                        return ParseAuth(auth, host)
                                }

                        }

                        // Applied before the authorizer is built, so the authorizer and
                        // the registry host share the updated client.
                        if updateClientFn != nil {
                                if err := updateClientFn(client); err != nil {
                                        return nil, fmt.Errorf("failed to update http client: %w", err)
                                }
                        }

                        authorizer := docker.NewDockerAuthorizer(
                                docker.WithAuthClient(client),
                                docker.WithAuthCreds(credentials))

                        // Endpoints without a path default to the registry API root.
                        if u.Path == "" {
                                u.Path = "/v2"
                        }

                        registries = append(registries, docker.RegistryHost{
                                Client:       client,
                                Authorizer:   authorizer,
                                Host:         u.Host,
                                Scheme:       u.Scheme,
                                Path:         u.Path,
                                Capabilities: docker.HostCapabilityResolve | docker.HostCapabilityPull,
                        })
                }
                return registries, nil
        }
}

// toRuntimeAuthConfig converts cri plugin auth config to runtime auth config.
// Only the username, password, auth and identity token fields are carried over.
func toRuntimeAuthConfig(a criconfig.AuthConfig) *runtime.AuthConfig {
        converted := new(runtime.AuthConfig)
        converted.Username = a.Username
        converted.Password = a.Password
        converted.Auth = a.Auth
        converted.IdentityToken = a.IdentityToken
        return converted
}

// defaultScheme returns the default scheme for a registry host: "http" when
// docker.IsLocalhost reports the host as local, "https" otherwise.
func defaultScheme(host string) string {
        scheme := "https"
        if docker.IsLocalhost(host) {
                scheme = "http"
        }
        return scheme
}

// addDefaultScheme returns the endpoint with default scheme
func addDefaultScheme(endpoint string) (string, error) {
        if strings.Contains(endpoint, "://") {
                return endpoint, nil
        }
        ue := "dummy://" + endpoint
        u, err := url.Parse(ue)
        if err != nil {
                return "", err
        }
        return fmt.Sprintf("%s://%s", defaultScheme(u.Host), endpoint), nil
}

// registryEndpoints returns endpoints for a given host.
// It adds default registry endpoint if it does not exist in the passed-in endpoint list.
// It also supports wildcard host matching with `*`.
//
// The mirror configured for host is used when present, otherwise the one
// configured for "*". Every endpoint is given a default scheme when it has
// none. The result is a fresh slice: the endpoints stored in the configuration
// are never modified, so concurrent callers do not write to shared state.
//
// An error is returned when the default host cannot be determined or when an
// endpoint cannot be parsed as a URL.
func (c *CRIImageService) registryEndpoints(host string) ([]string, error) {
        mirror, ok := c.config.Registry.Mirrors[host]
        if !ok {
                mirror = c.config.Registry.Mirrors["*"]
        }
        defaultHost, err := docker.DefaultHost(host)
        if err != nil {
                return nil, fmt.Errorf("get default host: %w", err)
        }

        // Normalize into a private copy with room for the default endpoint, so
        // that neither the element writes nor the final append can touch the
        // slice owned by the configuration.
        endpoints := make([]string, 0, len(mirror.Endpoints)+1)
        for _, configured := range mirror.Endpoints {
                en, err := addDefaultScheme(configured)
                if err != nil {
                        return nil, fmt.Errorf("parse endpoint url: %w", err)
                }
                endpoints = append(endpoints, en)
        }

        for _, e := range endpoints {
                u, err := url.Parse(e)
                if err != nil {
                        return nil, fmt.Errorf("parse endpoint url: %w", err)
                }
                if u.Host == host {
                        // Do not add default if the endpoint already exists.
                        return endpoints, nil
                }
        }
        return append(endpoints, defaultScheme(defaultHost)+"://"+defaultHost), nil
}

// newTransport returns a new HTTP transport used to pull image.
// Each call builds an independent transport with its own dialer; the proxy is
// taken from the environment.
// TODO(random-liu): Create a library and share this code with `ctr`.
func newTransport() *http.Transport {
        dialer := &net.Dialer{
                Timeout:       30 * time.Second,
                KeepAlive:     30 * time.Second,
                FallbackDelay: 300 * time.Millisecond,
        }

        tr := &http.Transport{
                Proxy:                 http.ProxyFromEnvironment,
                DialContext:           dialer.DialContext,
                MaxIdleConns:          10,
                IdleConnTimeout:       30 * time.Second,
                TLSHandshakeTimeout:   10 * time.Second,
                ExpectContinueTimeout: 5 * time.Second,
        }
        return tr
}

// encryptedImagesPullOpts returns the necessary list of pull options required
// for decryption of encrypted images based on the cri decryption configuration.
// Temporarily removed for v2 upgrade
//func (c *CRIImageService) encryptedImagesPullOpts() []containerd.RemoteOpt {
//        if c.config.ImageDecryption.KeyModel == criconfig.KeyModelNode {
//                ltdd := imgcrypt.Payload{}
//                decUnpackOpt := encryption.WithUnpackConfigApplyOpts(encryption.WithDecryptedUnpack(&ltdd))
//                opt := containerd.WithUnpackOpts([]containerd.UnpackOpt{decUnpackOpt})
//                return []containerd.RemoteOpt{opt}
//        }
//        return nil
//}

const (
        // defaultPullProgressReportInterval is how often the reporter checks
        // pull progress, unless the configured timeout is shorter than this
        // interval.
        defaultPullProgressReportInterval = 10 * time.Second
)

// pullProgressReporter is used to check single PullImage progress.
// It cancels the pull when no progress is observed within timeout.
type pullProgressReporter struct {
        // ref is the image reference; it is only used in log messages.
        ref         string
        // cancel is called by start when the pull makes no progress within timeout.
        cancel      context.CancelFunc
        // reqReporter holds the counters that the wrapped HTTP transport
        // updates and that start polls.
        reqReporter pullRequestReporter
        // timeout is the longest period without progress that is tolerated.
        // A zero timeout means the reporter is not started.
        timeout     time.Duration
}

// newPullProgressReporter returns a reporter for the pull of ref that calls
// cancel when no progress is seen within timeout. Its request counters start
// at zero.
func newPullProgressReporter(ref string, cancel context.CancelFunc, timeout time.Duration) *pullProgressReporter {
        r := new(pullProgressReporter)
        r.ref = ref
        r.cancel = cancel
        r.timeout = timeout
        // r.reqReporter is left at its zero value, which is what a fresh
        // reporter needs.
        return r
}

// optionUpdateClient wraps the client's current transport in a
// pullRequestReporterRoundTripper bound to this reporter's counters, so that
// requests made through client are accounted for. It always returns nil.
func (reporter *pullProgressReporter) optionUpdateClient(client *http.Client) error {
        wrapped := &pullRequestReporterRoundTripper{
                rt:          client.Transport,
                reqReporter: &reporter.reqReporter,
        }
        client.Transport = wrapped
        return nil
}

// start launches a goroutine that periodically polls the request counters and
// cancels the pull when there are active requests but no new bytes have been
// read for longer than the configured timeout.
//
// A non-positive timeout disables the reporter: nothing is started. The
// goroutine ends when it cancels the pull or when ctx is done.
//
// Progress is checked every defaultPullProgressReportInterval, or every half
// timeout when the timeout is shorter than that interval.
func (reporter *pullProgressReporter) start(ctx context.Context) {
        // A negative timeout is treated like zero; it would otherwise produce a
        // non-positive ticker interval, which makes time.NewTicker panic.
        if reporter.timeout <= 0 {
                log.G(ctx).Infof("no timeout and will not start pulling image %s reporter", reporter.ref)
                return
        }

        go func() {
                var (
                        reportInterval = defaultPullProgressReportInterval

                        lastSeenBytesRead = uint64(0)
                        lastSeenTimestamp = time.Now()
                )

                // check progress more frequently if timeout < default internal
                if reporter.timeout < reportInterval {
                        reportInterval = reporter.timeout / 2
                        // Halving a 1ns timeout rounds down to zero; keep the
                        // interval positive for time.NewTicker.
                        if reportInterval <= 0 {
                                reportInterval = reporter.timeout
                        }
                }

                var ticker = time.NewTicker(reportInterval)
                defer ticker.Stop()

                for {
                        select {
                        case <-ticker.C:
                                activeReqs, bytesRead := reporter.reqReporter.status()

                                log.G(ctx).WithField("ref", reporter.ref).
                                        WithField("activeReqs", activeReqs).
                                        WithField("totalBytesRead", bytesRead).
                                        WithField("lastSeenBytesRead", lastSeenBytesRead).
                                        WithField("lastSeenTimestamp", lastSeenTimestamp.Format(time.RFC3339)).
                                        WithField("reportInterval", reportInterval).
                                        Debugf("progress for image pull")

                                // No request in flight, or bytes were read since the
                                // last check: record the progress and keep waiting.
                                if activeReqs == 0 || bytesRead > lastSeenBytesRead {
                                        lastSeenBytesRead = bytesRead
                                        lastSeenTimestamp = time.Now()
                                        continue
                                }

                                if time.Since(lastSeenTimestamp) > reporter.timeout {
                                        log.G(ctx).Errorf("cancel pulling image %s because of no progress in %v", reporter.ref, reporter.timeout)
                                        reporter.cancel()
                                        return
                                }
                        case <-ctx.Done():
                                activeReqs, bytesRead := reporter.reqReporter.status()
                                log.G(ctx).Infof("stop pulling image %s: active requests=%v, bytes read=%v", reporter.ref, activeReqs, bytesRead)
                                return
                        }
                }
        }()
}

// countingReadCloser wraps http.Response.Body with pull request reporter,
// which is used by pullRequestReporterRoundTripper.
type countingReadCloser struct {
        // once makes sure Close decreases the active request count a single
        // time, however often Close is called.
        once sync.Once

        // rc is the wrapped reader that Read and Close delegate to.
        rc          io.ReadCloser
        // reqReporter receives the byte counts and the request-finished signal.
        reqReporter *pullRequestReporter
}

// Read delegates to the wrapped io.ReadCloser and adds the number of bytes it
// returned to the pull request reporter, whether or not the read also
// reported an error.
func (r *countingReadCloser) Read(p []byte) (int, error) {
        nread, readErr := r.rc.Read(p)
        r.reqReporter.incByteRead(uint64(nread))
        return nread, readErr
}

// Close closes the wrapped io.ReadCloser and returns its error. The number of
// active pull requests is decreased on the first call only; later calls still
// close the wrapped body but leave the counter alone.
func (r *countingReadCloser) Close() error {
        closeErr := r.rc.Close()
        r.once.Do(func() {
                r.reqReporter.decRequest()
        })
        return closeErr
}

// pullRequestReporter is used to track the progress per each criapi.PullImage.
//
// Both counters are typed atomics instead of plain integers driven by the
// atomic.Add*/Load* functions. sync/atomic only guarantees 64-bit alignment
// for atomic.Uint64 (and atomic.Int64); a bare uint64 field that follows an
// int32, or that lives in a struct embedded by value in another struct, may
// be misaligned on 32-bit platforms (ARM, 386, 32-bit MIPS), where a 64-bit
// atomic operation on a misaligned word panics.
//
// The zero value is ready to use. A pullRequestReporter must not be copied
// after first use.
type pullRequestReporter struct {
        // activeReqs indicates that current number of active pulling requests,
        // including auth requests.
        activeReqs atomic.Int32
        // totalBytesRead indicates that the total bytes has been read from
        // remote registry.
        totalBytesRead atomic.Uint64
}

// incRequest records that one more pull request became active.
func (reporter *pullRequestReporter) incRequest() {
        reporter.activeReqs.Add(1)
}

// decRequest records that one active pull request finished.
func (reporter *pullRequestReporter) decRequest() {
        reporter.activeReqs.Add(-1)
}

// incByteRead adds nr to the total number of bytes read.
func (reporter *pullRequestReporter) incByteRead(nr uint64) {
        reporter.totalBytesRead.Add(nr)
}

// status returns the current number of active requests and the total number
// of bytes read so far. The two values are loaded one after the other, so
// together they are not a single atomic snapshot.
func (reporter *pullRequestReporter) status() (currentReqs int32, totalBytesRead uint64) {
        currentReqs = reporter.activeReqs.Load()
        totalBytesRead = reporter.totalBytesRead.Load()
        return currentReqs, totalBytesRead
}

// pullRequestReporterRoundTripper wraps http.RoundTripper with pull request
// reporter which is used to track the progress of active http request with
// counting readable http.Response.Body.
//
// NOTE:
//
// Although containerd provides ingester manager to track the progress
// of pulling request, for example `ctr image pull` shows the console progress
// bar, it needs more CPU resources to open/read the ingested files with
// acquiring containerd metadata plugin's boltdb lock.
//
// Before sending HTTP request to registry, the containerd.Client.Pull library
// will open writer by containerd ingester manager. Based on this, the
// http.RoundTripper wrapper can track the active progress with lower overhead
// even if the ref has been locked in ingester manager by other Pull request.
type pullRequestReporterRoundTripper struct {
        // rt is the underlying transport that actually performs the request.
        rt http.RoundTripper

        // reqReporter is updated by RoundTrip for every request and is handed
        // to the counting body that RoundTrip installs on each response.
        reqReporter *pullRequestReporter
}

// RoundTrip marks a request as active, forwards it to the wrapped transport
// and, on success, replaces the response body with a countingReadCloser so
// that bytes read are reported and the request is marked finished when the
// body is closed. If the wrapped transport fails, the request is marked
// finished immediately and the error is returned with a nil response.
func (rt *pullRequestReporterRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
        reporter := rt.reqReporter
        reporter.incRequest()

        resp, err := rt.rt.RoundTrip(req)
        if err != nil {
                reporter.decRequest()
                return nil, err
        }

        body := resp.Body
        resp.Body = &countingReadCloser{rc: body, reqReporter: reporter}
        return resp, nil
}

// snapshotterFromPodSandboxConfig picks the snapshotter to use for pulling
// imageRef.
//
// Given that runtime information is not passed from PullImageRequest, we depend on an experimental annotation
// passed from pod sandbox config to get the runtimeHandler. The annotation key is specified in configuration.
// Once we know the runtime, try to override default snapshotter if it is set for this runtime.
// See https://github.com/containerd/containerd/issues/6657
//
// The configured default snapshotter is returned when there is no sandbox
// config, no runtime handler annotation, no platform entry for the handler, or
// when the platform entry names the default snapshotter. The returned error is
// always nil.
func (c *CRIImageService) snapshotterFromPodSandboxConfig(ctx context.Context, imageRef string,
        s *runtime.PodSandboxConfig) (string, error) {
        defaultSnapshotter := c.config.Snapshotter
        if s == nil || s.Annotations == nil {
                return defaultSnapshotter, nil
        }

        // TODO(kiashok): honor the new CRI runtime handler field added to v0.29.0
        // for image pull per runtime class support.
        handler, annotated := s.Annotations[annotations.RuntimeHandler]
        if !annotated {
                return defaultSnapshotter, nil
        }

        // TODO: Ensure error is returned if runtime not found?
        // Indexing a nil map is safe and simply reports "not found".
        platform, configured := c.runtimePlatforms[handler]
        if !configured || platform.Snapshotter == defaultSnapshotter {
                return defaultSnapshotter, nil
        }

        log.G(ctx).Infof("experimental: PullImage %q for runtime %s, using snapshotter %s", imageRef, handler, platform.Snapshotter)
        return platform.Snapshotter, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "fmt"

        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/tracing"
        "github.com/containerd/errdefs"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// RemoveImage removes the image.
// TODO(random-liu): Update CRI to pass image reference instead of ImageSpec. (See
// kubernetes/kubernetes#46255)
// TODO(random-liu): We should change CRI to distinguish image id and image spec.
// The whole image is removed no matter whether it is addressed by image id or
// by reference. This is the semantic defined in CRI now.
//
// A not-found error from the underlying service is treated as success.
func (c *GRPCCRIImageService) RemoveImage(ctx context.Context, r *runtime.RemoveImageRequest) (*runtime.RemoveImageResponse, error) {
        if err := c.CRIImageService.RemoveImage(ctx, r.GetImage()); err != nil && !errdefs.IsNotFound(err) {
                return nil, err
        }
        return &runtime.RemoveImageResponse{}, nil
}

// RemoveImage resolves imageSpec locally and deletes every reference of the
// resolved image, refreshing the image store after each deletion.
//
// It returns nil when the image cannot be found locally. A reference that is
// already gone when it is deleted is not an error either. Any other resolve,
// delete or image store update failure is returned wrapped.
func (c *CRIImageService) RemoveImage(ctx context.Context, imageSpec *runtime.ImageSpec) error {
        span := tracing.SpanFromContext(ctx)

        image, err := c.LocalResolve(imageSpec.GetImage())
        switch {
        case err == nil:
                // resolved, carry on below.
        case errdefs.IsNotFound(err):
                span.AddEvent(err.Error())
                // return empty without error when image not found.
                return nil
        default:
                return fmt.Errorf("can not resolve %q locally: %w", imageSpec.GetImage(), err)
        }
        span.SetAttributes(tracing.Attribute("image.id", image.ID))

        // Remove all image references.
        lastIdx := len(image.References) - 1
        for idx, ref := range image.References {
                var deleteOpts []images.DeleteOpt
                if idx == lastIdx {
                        // Delete the last image reference synchronously to trigger garbage collection.
                        // This is best effort. It is possible that the image reference is deleted by
                        // someone else before this point.
                        deleteOpts = []images.DeleteOpt{images.SynchronousDelete()}
                }
                if err := c.images.Delete(ctx, ref, deleteOpts...); err != nil && !errdefs.IsNotFound(err) {
                        return fmt.Errorf("failed to delete image reference %q for %q: %w", ref, image.ID, err)
                }
                // Update image store to reflect the newest state in containerd.
                if err := c.imageStore.Update(ctx, ref); err != nil {
                        return fmt.Errorf("failed to update image reference %q for %q: %w", ref, image.ID, err)
                }
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "encoding/json"
        "fmt"
        "strconv"
        "strings"

        imagestore "github.com/containerd/containerd/v2/internal/cri/store/image"
        "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/containerd/v2/pkg/tracing"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"

        imagespec "github.com/opencontainers/image-spec/specs-go/v1"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// ImageStatus returns the status of the image. When the image isn't present
// locally it returns an empty response and no error.
// TODO(random-liu): We should change CRI to distinguish image id and image spec. (See
// kubernetes/kubernetes#46255)
func (c *CRIImageService) ImageStatus(ctx context.Context, r *runtime.ImageStatusRequest) (*runtime.ImageStatusResponse, error) {
        span := tracing.SpanFromContext(ctx)

        image, err := c.LocalResolve(r.GetImage().GetImage())
        switch {
        case err == nil:
                // resolved, carry on below.
        case errdefs.IsNotFound(err):
                span.AddEvent(err.Error())
                // return empty without error when image not found.
                return &runtime.ImageStatusResponse{}, nil
        default:
                return nil, fmt.Errorf("can not resolve %q locally: %w", r.GetImage().GetImage(), err)
        }
        span.SetAttributes(tracing.Attribute("image.id", image.ID))
        // TODO(random-liu): [P0] Make sure corresponding snapshot exists. What if snapshot
        // doesn't exist?

        resp := &runtime.ImageStatusResponse{Image: toCRIImage(image)}
        if resp.Info, err = c.toCRIImageInfo(ctx, &image, r.GetVerbose()); err != nil {
                return nil, fmt.Errorf("failed to generate image info: %w", err)
        }
        return resp, nil
}

// toCRIImage converts internal image object to CRI runtime.Image.
//
// The image references are split into repo tags and repo digests, and the
// user from the image config is reported either as a numeric Uid or as a
// Username (see getUserFromImage).
func toCRIImage(image imagestore.Image) *runtime.Image {
        tags, digests := util.ParseImageReferences(image.References)
        uid, username := getUserFromImage(image.ImageSpec.Config.User)

        criImage := &runtime.Image{
                Id:          image.ID,
                RepoTags:    tags,
                RepoDigests: digests,
                Size_:       uint64(image.Size),
                Pinned:      image.Pinned,
                Username:    username,
        }
        if uid != nil {
                criImage.Uid = &runtime.Int64Value{Value: *uid}
        }
        return criImage
}

// getUserFromImage gets uid or user name of the image user.
//
// Only the part before the first ":" is considered, so a "user:group" value
// is reduced to "user". If that part parses as a base-10 int64 it is returned
// as the uid with an empty name; otherwise it is returned as the user name
// with a nil uid. An empty user yields (nil, "").
func getUserFromImage(user string) (*int64, string) {
        // return both empty if user is not specified in the image.
        if user == "" {
                return nil, ""
        }
        // drop the group when the value has the form user:group.
        name, _, _ := strings.Cut(user, ":")
        // user could be either uid or user name. Try to interpret as numeric uid.
        if uid, err := strconv.ParseInt(name, 10, 64); err == nil {
                return &uid, ""
        }
        // If user is non numeric, assume it's user name.
        return nil, name
}

// verboseImageInfo is the payload that toCRIImageInfo marshals to JSON and
// stores under the "info" key of the verbose image status info map. It carries
// the image's chain ID and its image spec.
//
// TODO (mikebrow): discuss moving this struct and / or constants for info map for some or all of these fields to CRI
type verboseImageInfo struct {
        ChainID   string          `json:"chainID"`
        ImageSpec imagespec.Image `json:"imageSpec"`
}

// toCRIImageInfo converts internal image object information to CRI image status response info map.
//
// It returns a nil map unless verbose is set. Otherwise the map has a single
// "info" entry holding the JSON encoding of the image's chain ID and image
// spec; if encoding fails, the failure is logged and the entry holds the error
// text instead. The returned error is always nil.
func (c *CRIImageService) toCRIImageInfo(ctx context.Context, image *imagestore.Image, verbose bool) (map[string]string, error) {
        if !verbose {
                return nil, nil
        }

        imi := &verboseImageInfo{
                ChainID:   image.ChainID,
                ImageSpec: image.ImageSpec,
        }

        encoded, err := json.Marshal(imi)
        if err != nil {
                log.G(ctx).WithError(err).Errorf("failed to marshal info %v", imi)
                return map[string]string{"info": err.Error()}, nil
        }
        return map[string]string{"info": string(encoded)}, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "time"

        "github.com/containerd/containerd/v2/internal/cri/store/snapshot"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// ImageFsInfo returns information of the filesystem that is used to store images.
//
// The cached snapshot stats are summed per snapshotter: sizes and inode counts
// are added up and the oldest snapshot timestamp is kept. The entry for the
// default snapshotter always comes first, with zero usage and the current time
// if no snapshot of it is cached; entries for the other snapshotters follow in
// map iteration order.
// TODO(windows): Usage for windows is always 0 right now. Support this for windows.
// TODO(random-liu): Handle storage consumed by content store
func (c *CRIImageService) ImageFsInfo(ctx context.Context, r *runtime.ImageFsInfoRequest) (*runtime.ImageFsInfoResponse, error) {
        totals := map[string]snapshot.Snapshot{}
        for _, sn := range c.snapshotStore.List() {
                name := sn.Key.Snapshotter
                agg, seen := totals[name]
                // Use the oldest timestamp as the timestamp of imagefs info.
                if !seen || sn.Timestamp < agg.Timestamp {
                        agg.Timestamp = sn.Timestamp
                }
                agg.Size += sn.Size
                agg.Inodes += sn.Inodes
                totals[name] = agg
        }

        // usageOf renders the aggregated stats of one snapshotter.
        usageOf := func(snapshotter string, info snapshot.Snapshot) *runtime.FilesystemUsage {
                return &runtime.FilesystemUsage{
                        Timestamp:  info.Timestamp,
                        FsId:       &runtime.FilesystemIdentifier{Mountpoint: c.imageFSPaths[snapshotter]},
                        UsedBytes:  &runtime.UInt64Value{Value: info.Size},
                        InodesUsed: &runtime.UInt64Value{Value: info.Inodes},
                }
        }

        // Currently kubelet always consumes the first entry of the returned array,
        // so put the default snapshotter as the first entry for compatibility.
        defaultSnapshotter := c.config.Snapshotter
        defaultInfo, cached := totals[defaultSnapshotter]
        if cached {
                delete(totals, defaultSnapshotter)
        } else {
                defaultInfo = snapshot.Snapshot{Timestamp: time.Now().UnixNano()}
        }
        filesystems := []*runtime.FilesystemUsage{usageOf(defaultSnapshotter, defaultInfo)}

        for name, info := range totals {
                filesystems = append(filesystems, usageOf(name, info))
        }

        return &runtime.ImageFsInfoResponse{ImageFilesystems: filesystems}, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "github.com/docker/go-metrics"
        prom "github.com/prometheus/client_golang/prometheus"
)

// Image pull metrics. All three are created and registered by init:
// imagePulls is a counter labeled by "status", inProgressImagePulls is a
// gauge, and imagePullThroughput is a histogram.
var (
        imagePulls           metrics.LabeledCounter
        inProgressImagePulls metrics.Gauge
        // image size in MB / image pull duration in seconds
        imagePullThroughput prom.Histogram
)

// init creates the image pull metrics under the containerd/cri_sandboxed
// namespace and registers that namespace with the metrics package.
func init() {
        const (
                namespace = "containerd"
                subsystem = "cri_sandboxed"
        )

        ns := metrics.NewNamespace(namespace, subsystem, nil)

        // counter of pulls by outcome and gauge of pulls currently running.
        imagePulls = ns.NewLabeledCounter("image_pulls", "succeeded and failed counters", "status")
        inProgressImagePulls = ns.NewGauge("in_progress_image_pulls", "in progress pulls", metrics.Total)

        // the histogram is built with the prometheus client directly, so it has
        // to be added to the namespace by hand.
        throughputOpts := prom.HistogramOpts{
                Namespace: namespace,
                Subsystem: subsystem,
                Name:      "image_pulling_throughput",
                Help:      "image pull throughput",
                Buckets:   prom.DefBuckets,
        }
        imagePullThroughput = prom.NewHistogram(throughputOpts)
        ns.Add(imagePullThroughput)

        metrics.Register(ns)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "time"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/snapshots"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        imagestore "github.com/containerd/containerd/v2/internal/cri/store/image"
        snapshotstore "github.com/containerd/containerd/v2/internal/cri/store/snapshot"
        "github.com/containerd/containerd/v2/internal/kmutex"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
        docker "github.com/distribution/reference"
        imagedigest "github.com/opencontainers/go-digest"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// imageClient is the subset of the containerd client that the CRI image
// service depends on: listing images, getting one image, and pulling an image
// with remote options.
type imageClient interface {
        ListImages(context.Context, ...string) ([]containerd.Image, error)
        GetImage(context.Context, string) (containerd.Image, error)
        Pull(context.Context, string, ...containerd.RemoteOpt) (containerd.Image, error)
}

// ImagePlatform pairs a snapshotter name with a platform. CRIImageService
// keeps one ImagePlatform per runtime handler in its runtimePlatforms map.
type ImagePlatform struct {
        Snapshotter string
        Platform    platforms.Platform
}

// CRIImageService holds the configuration, stores and client used by the CRI
// image operations. Instances are created with NewService.
type CRIImageService struct {
        // config contains all image configurations.
        config criconfig.ImageConfig
        // images is the lower level image store used for raw storage,
        // no event publishing should currently be assumed
        images images.Store
        // client is a subset of the containerd client
        // and will be replaced by image store and transfer service
        client imageClient
        // imageFSPaths contains path to image filesystem for snapshotters.
        imageFSPaths map[string]string
        // runtimePlatforms are the platforms configured for a runtime.
        runtimePlatforms map[string]ImagePlatform
        // imageStore stores all resources associated with images.
        imageStore *imagestore.Store
        // snapshotStore stores information of all snapshots.
        snapshotStore *snapshotstore.Store
        // unpackDuplicationSuppressor is used to make sure that there is only
        // one in-flight fetch request or unpack handler for a given descriptor
        // or chain ID.
        unpackDuplicationSuppressor kmutex.KeyedLocker
}

// GRPCCRIImageService embeds CRIImageService and is the value that
// CRIImageService.GRPCService returns as the CRI image gRPC server.
type GRPCCRIImageService struct {
        *CRIImageService
}

// CRIImageServiceOptions carries the dependencies that NewService wires into
// a CRIImageService.
type CRIImageServiceOptions struct {
        // Content is passed to the image store that NewService creates.
        Content content.Store

        // Images becomes the service's lower level image store and is also
        // passed to the image store that NewService creates.
        Images images.Store

        // ImageFSPaths becomes the service's image filesystem paths.
        ImageFSPaths map[string]string

        // RuntimePlatforms becomes the service's runtime platforms.
        RuntimePlatforms map[string]ImagePlatform

        // Snapshotters are handed to the snapshots syncer started by NewService.
        Snapshotters map[string]snapshots.Snapshotter

        // Client becomes the service's containerd client subset.
        Client imageClient
}

// NewService creates a new CRI Image Service and starts its background
// snapshots syncer, which runs every config.StatsCollectPeriod seconds. The
// returned error is always nil.
//
// TODO:
//  1. Generalize the image service and merge with a single higher level image service
//  2. Update the options to remove client and imageFSPath
//     - Platform configuration with Array/Map of snapshotter names + filesystem ID + platform matcher + runtime to snapshotter
//     - Transfer service implementation
//     - Image Service (from metadata)
//     - Content store (from metadata)
//  3. Separate image cache and snapshot cache to first class plugins, make the snapshot cache much more efficient and intelligent
func NewService(config criconfig.ImageConfig, options *CRIImageServiceOptions) (*CRIImageService, error) {
        svc := &CRIImageService{
                config:                      config,
                images:                      options.Images,
                client:                      options.Client,
                imageStore:                  imagestore.NewStore(options.Images, options.Content, platforms.Default()),
                imageFSPaths:                options.ImageFSPaths,
                runtimePlatforms:            options.RuntimePlatforms,
                snapshotStore:               snapshotstore.NewStore(),
                unpackDuplicationSuppressor: kmutex.New(),
        }

        log.L.Info("Start snapshots syncer")
        period := time.Duration(svc.config.StatsCollectPeriod) * time.Second
        newSnapshotsSyncer(svc.snapshotStore, options.Snapshotters, period).start()

        return svc, nil
}

// LocalResolve resolves image reference locally and returns corresponding image metadata. It
// returns errdefs.ErrNotFound if the reference doesn't exist.
//
// refOrID is looked up in the image store as follows:
//   - if it parses as a digest, it is used as the image id directly;
//   - otherwise it is normalized as a docker reference and resolved to an
//     image id through the image store;
//   - if normalization or resolution fails (or yields an empty id), refOrID
//     itself is tried as the image id.
func (c *CRIImageService) LocalResolve(refOrID string) (imagestore.Image, error) {
        // Fall back to treating the input as an image id unless it resolves below.
        imageID := refOrID

        if _, err := imagedigest.Parse(refOrID); err != nil {
                // refOrID is not an image id, try to resolve it locally.
                // TODO(random-liu): Handle this error better for debugging.
                if normalized, err := docker.ParseDockerRef(refOrID); err == nil {
                        if id, err := c.imageStore.Resolve(normalized.String()); err == nil && id != "" {
                                imageID = id
                        }
                }
        }

        return c.imageStore.Get(imageID)
}

// RuntimeSnapshotter returns the snapshotter set on ociRuntime, or the
// configured default snapshotter when the runtime does not set one.
// See https://github.com/containerd/containerd/issues/6657
// TODO: Pass in name and get back runtime platform
func (c *CRIImageService) RuntimeSnapshotter(ctx context.Context, ociRuntime criconfig.Runtime) string {
        if override := ociRuntime.Snapshotter; override != "" {
                log.G(ctx).Debugf("Set snapshotter for runtime %s to %s", ociRuntime.Type, override)
                return override
        }
        return c.config.Snapshotter
}

// GetImage looks up image metadata in the image store by image id and returns
// whatever the store reports for it.
func (c *CRIImageService) GetImage(id string) (imagestore.Image, error) {
        image, err := c.imageStore.Get(id)
        return image, err
}

// GetSnapshot returns the cached snapshot stored under the given key for the
// given snapshotter.
func (c *CRIImageService) GetSnapshot(key, snapshotter string) (snapshotstore.Snapshot, error) {
        return c.snapshotStore.Get(snapshotstore.Key{Key: key, Snapshotter: snapshotter})
}

// ImageFSPaths returns the service's map of image filesystem paths for
// snapshotters. The map itself is returned, not a copy.
func (c *CRIImageService) ImageFSPaths() map[string]string {
        paths := c.imageFSPaths
        return paths
}

// PinnedImage looks up a pinned image by name in the image configuration and
// returns the empty string when no entry exists for name.
// Most often used to get the "sandbox" image.
func (c *CRIImageService) PinnedImage(name string) string {
        pinned := c.config.PinnedImages
        return pinned[name]
}

// GRPCService wraps the service in a new GRPCCRIImageService and returns it
// as the CRI image gRPC server.
func (c *CRIImageService) GRPCService() runtime.ImageServiceServer {
        return &GRPCCRIImageService{CRIImageService: c}
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "fmt"
        "time"

        snapshot "github.com/containerd/containerd/v2/core/snapshots"
        snapshotstore "github.com/containerd/containerd/v2/internal/cri/store/snapshot"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
)

// snapshotsSyncer syncs snapshot stats periodically. imagefs info and container stats
// should both use cached result here.
//
// store is the cache that sync fills, snapshotters are the snapshotters (by
// name) that sync walks, and syncPeriod is the ticker interval used by start.
// TODO(random-liu): Benchmark with high workload. We may need a statsSyncer instead if
// benchmark result shows that container cpu/memory stats also need to be cached.
type snapshotsSyncer struct {
        store        *snapshotstore.Store
        snapshotters map[string]snapshot.Snapshotter
        syncPeriod   time.Duration
}

// newSnapshotsSyncer builds a snapshot syncer that fills store from the given
// snapshotters every period. The syncer does nothing until start is called.
func newSnapshotsSyncer(store *snapshotstore.Store, snapshotters map[string]snapshot.Snapshotter,
        period time.Duration) *snapshotsSyncer {
        syncer := new(snapshotsSyncer)
        syncer.store = store
        syncer.snapshotters = snapshotters
        syncer.syncPeriod = period
        return syncer
}

// start starts the snapshots syncer. No stop function is needed because
// the syncer doesn't update any persistent states, it's fine to let it
// exit with the process.
//
// The first sync runs right away in a background goroutine; every following
// sync waits for the next tick of syncPeriod. Sync failures are logged and do
// not stop the loop.
func (s *snapshotsSyncer) start() {
        ticker := time.NewTicker(s.syncPeriod)
        go func() {
                defer ticker.Stop()
                // TODO(random-liu): This is expensive. We should do benchmark to
                // check the resource usage and optimize this.
                for ; ; <-ticker.C {
                        if err := s.sync(); err != nil {
                                log.L.WithError(err).Error("Failed to sync snapshot stats")
                        }
                }
        }()
}

// sync updates all snapshots stats.
//
// For every snapshotter it walks all snapshots. A snapshot that is already
// cached with the same kind and is not active only gets its timestamp
// refreshed; any other snapshot has its usage queried and is stored anew.
// Afterwards every cache entry whose timestamp is older than the start of this
// pass is deleted. A failing walk aborts the pass with an error; a failing
// usage query only skips that snapshot.
func (s *snapshotsSyncer) sync() error {
        ctx := ctrdutil.NamespacedContext()
        passStart := time.Now().UnixNano()

        for name, snapshotter := range s.snapshotters {
                // Do not call `Usage` directly in collect function, because
                // `Usage` takes time, we don't want `Walk` to hold read lock
                // of snapshot metadata store for too long time.
                // TODO(random-liu): Set timeout for the following 2 contexts.
                var infos []snapshot.Info
                collect := func(_ context.Context, info snapshot.Info) error {
                        infos = append(infos, info)
                        return nil
                }
                if err := snapshotter.Walk(ctx, collect); err != nil {
                        return fmt.Errorf("walk all snapshots for %q failed: %w", name, err)
                }

                for _, info := range infos {
                        storeKey := snapshotstore.Key{
                                Key:         info.Name,
                                Snapshotter: name,
                        }

                        // Only update timestamp for non-active snapshot.
                        cached, err := s.store.Get(storeKey)
                        if err == nil && cached.Kind == info.Kind && cached.Kind != snapshot.KindActive {
                                cached.Timestamp = time.Now().UnixNano()
                                s.store.Add(cached)
                                continue
                        }

                        // Get newest stats if the snapshot is new or active.
                        fresh := snapshotstore.Snapshot{
                                Key:       storeKey,
                                Kind:      info.Kind,
                                Timestamp: time.Now().UnixNano(),
                        }
                        usage, err := snapshotter.Usage(ctx, info.Name)
                        if err != nil {
                                if !errdefs.IsNotFound(err) {
                                        log.L.WithError(err).Errorf("Failed to get usage for snapshot %q", info.Name)
                                }
                                continue
                        }
                        fresh.Size = uint64(usage.Size)
                        fresh.Inodes = uint64(usage.Inodes)
                        s.store.Add(fresh)
                }
        }

        // Delete the snapshot stats if it's not updated this time.
        for _, stale := range s.store.List() {
                if stale.Timestamp < passStart {
                        s.store.Delete(stale.Key)
                }
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"

        "google.golang.org/grpc/codes"
        "google.golang.org/grpc/status"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// ListMetricDescriptors is not implemented here: both arguments are ignored
// and the call always fails with a gRPC Unimplemented status.
func (c *criService) ListMetricDescriptors(context.Context, *runtime.ListMetricDescriptorsRequest) (*runtime.ListMetricDescriptorsResponse, error) {
        unimplemented := status.Errorf(codes.Unimplemented, "method ListMetricDescriptors not implemented")
        return nil, unimplemented
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"

        "google.golang.org/grpc/codes"
        "google.golang.org/grpc/status"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// ListPodSandboxMetrics is not implemented here: both arguments are ignored
// and the call always fails with a gRPC Unimplemented status.
func (c *criService) ListPodSandboxMetrics(context.Context, *runtime.ListPodSandboxMetricsRequest) (*runtime.ListPodSandboxMetricsResponse, error) {
        unimplemented := status.Errorf(codes.Unimplemented, "method ListPodSandboxMetrics not implemented")
        return nil, unimplemented
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "github.com/docker/go-metrics"
)

// Metric handles for the CRI service. They are only declared here; init
// creates them in a shared namespace and registers that namespace.
var (
        // Unlabeled sandbox timers.
        sandboxListTimer          metrics.Timer
        sandboxCreateNetworkTimer metrics.Timer
        sandboxDeleteNetwork      metrics.Timer

        // Sandbox timers carrying a "runtime" label.
        sandboxRuntimeCreateTimer metrics.LabeledTimer
        sandboxRuntimeStopTimer   metrics.LabeledTimer
        sandboxRemoveTimer        metrics.LabeledTimer

        // Container metrics; the labeled timers carry a "runtime" label.
        containerListTimer          metrics.Timer
        containerRemoveTimer        metrics.LabeledTimer
        containerCreateTimer        metrics.LabeledTimer
        containerStopTimer          metrics.LabeledTimer
        containerStartTimer         metrics.LabeledTimer
        containerEventsDroppedCount metrics.Counter
        containerCheckpointTimer    metrics.LabeledTimer

        // Network plugin metrics carrying an "operation_type" label.
        networkPluginOperations        metrics.LabeledCounter
        networkPluginOperationsErrors  metrics.LabeledCounter
        networkPluginOperationsLatency metrics.LabeledTimer
)

// init creates every CRI metric in the "containerd"/"cri_sandboxed" namespace
// and registers that namespace with the metrics package.
func init() {
        // these CRI metrics record latencies for successful operations around a sandbox and container's lifecycle.
        ns := metrics.NewNamespace("containerd", "cri_sandboxed", nil)

        sandboxListTimer = ns.NewTimer("sandbox_list", "time to list sandboxes")
        sandboxCreateNetworkTimer = ns.NewTimer("sandbox_create_network", "time to create the network for a sandbox")
        sandboxDeleteNetwork = ns.NewTimer("sandbox_delete_network", "time to delete a sandbox's network")

        sandboxRuntimeCreateTimer = ns.NewLabeledTimer("sandbox_runtime_create", "time to create a sandbox in the runtime", "runtime")
        sandboxRuntimeStopTimer = ns.NewLabeledTimer("sandbox_runtime_stop", "time to stop a sandbox", "runtime")
        sandboxRemoveTimer = ns.NewLabeledTimer("sandbox_remove", "time to remove a sandbox", "runtime")

        containerListTimer = ns.NewTimer("container_list", "time to list containers")
        containerRemoveTimer = ns.NewLabeledTimer("container_remove", "time to remove a container", "runtime")
        containerCreateTimer = ns.NewLabeledTimer("container_create", "time to create a container", "runtime")
        containerStopTimer = ns.NewLabeledTimer("container_stop", "time to stop a container", "runtime")
        containerStartTimer = ns.NewLabeledTimer("container_start", "time to start a container", "runtime")
        containerEventsDroppedCount = ns.NewCounter("container_events_dropped", "count container discarding event total from server start")
        containerCheckpointTimer = ns.NewLabeledTimer("container_checkpoint", "time to checkpoint a container", "runtime")

        networkPluginOperations = ns.NewLabeledCounter("network_plugin_operations_total", "cumulative number of network plugin operations by operation type", "operation_type")
        // The help text must describe errors; it previously duplicated the
        // help of network_plugin_operations_total.
        networkPluginOperationsErrors = ns.NewLabeledCounter("network_plugin_operations_errors_total", "cumulative number of network plugin operation errors by operation type", "operation_type")
        networkPluginOperationsLatency = ns.NewLabeledTimer("network_plugin_operations_duration_seconds", "latency in seconds of network plugin operations. Broken down by operation type", "operation_type")

        metrics.Register(ns)
}

// Network operation names, kept for backwards compatibility with
// kubelet/dockershim metrics.
// https://github.com/containerd/containerd/issues/7801
//
// NOTE(review): presumably these are the values of the "operation_type" label
// on the network plugin metrics; the call sites are not in this file.
const (
        networkStatusOp   = "get_pod_network_status"
        networkSetUpOp    = "set_up_pod"
        networkTearDownOp = "tear_down_pod"
)

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        crilabels "github.com/containerd/containerd/v2/internal/cri/labels"
        cstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        sstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
)

// criImplementation wraps a criService and exposes selected parts of it
// through exported methods.
type criImplementation struct {
        // c is the wrapped CRI service that every method delegates to.
        c *criService
}

// Config returns a pointer to the configuration stored in the wrapped CRI
// service (not a copy).
func (i *criImplementation) Config() *criconfig.Config {
        svc := i.c
        return &svc.config
}

// SandboxStore returns the sandbox store held by the wrapped CRI service.
func (i *criImplementation) SandboxStore() *sstore.Store {
        svc := i.c
        return svc.sandboxStore
}

// ContainerStore returns the container store held by the wrapped CRI service.
func (i *criImplementation) ContainerStore() *cstore.Store {
        svc := i.c
        return svc.containerStore
}

// ContainerMetadataExtensionKey returns crilabels.ContainerMetadataExtension.
// The receiver is not consulted.
func (i *criImplementation) ContainerMetadataExtensionKey() string {
        key := crilabels.ContainerMetadataExtension
        return key
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "time"

        cstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        cri "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// UpdateContainerResources forwards to the wrapped CRI service's
// updateContainerResources and returns its results unchanged.
func (i *criImplementation) UpdateContainerResources(ctx context.Context, ctr cstore.Container, req *cri.UpdateContainerResourcesRequest, status cstore.Status) (cstore.Status, error) {
        updated, err := i.c.updateContainerResources(ctx, ctr, req, status)
        return updated, err
}

// StopContainer forwards to the wrapped CRI service's stopContainer with the
// same container and timeout, and returns its error unchanged.
func (i *criImplementation) StopContainer(ctx context.Context, ctr cstore.Container, timeout time.Duration) error {
        svc := i.c
        return svc.stopContainer(ctx, ctr, timeout)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// TODO: these are copied from container_create_linux.go and should be consolidated later.

package podsandbox

import (
        "errors"
        "fmt"
        "strconv"
        "strings"

        "github.com/containerd/containerd/v2/contrib/seccomp"
        "github.com/containerd/containerd/v2/pkg/oci"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// Well-known security profile names and the prefix used for localhost profiles.
const (
        // profileNamePrefix is the prefix for loading profiles on a localhost. Eg. AppArmor localhost/profileName.
        profileNamePrefix = "localhost/" // TODO (mikebrow): get localhost/ & runtime/default from CRI kubernetes/kubernetes#51747
        // runtimeDefault indicates that we should use or create a runtime default profile.
        runtimeDefault = "runtime/default"
        // dockerDefault indicates that we should use or create a docker default profile.
        dockerDefault = "docker/default"
        // unconfinedProfile is a string indicating one should run a pod/container without a security profile.
        unconfinedProfile = "unconfined"
)

// generateSeccompSpecOpts translates a CRI seccomp security profile into a
// containerd SpecOpts.
//
// A nil SpecOpts with a nil error means "apply no seccomp profile". That is
// the result for privileged containers, for a nil profile, and for the
// Unconfined profile type. When seccomp is disabled, any non-nil profile
// other than Unconfined is rejected. LocalhostRef may only be set together
// with the Localhost profile type.
func (c *Controller) generateSeccompSpecOpts(sp *runtime.SecurityProfile, privileged, seccompEnabled bool) (oci.SpecOpts, error) {
        // Do not set seccomp profile when container is privileged.
        if privileged {
                return nil, nil
        }

        if !seccompEnabled {
                if sp != nil && sp.ProfileType != runtime.SecurityProfile_Unconfined {
                        return nil, errors.New("seccomp is not supported")
                }
                return nil, nil
        }

        if sp == nil {
                return nil, nil
        }

        profileType := sp.ProfileType
        if sp.LocalhostRef != "" && profileType != runtime.SecurityProfile_Localhost {
                return nil, errors.New("seccomp config invalid LocalhostRef must only be set if ProfileType is Localhost")
        }

        if profileType == runtime.SecurityProfile_Unconfined {
                // Do not set seccomp profile.
                return nil, nil
        }
        if profileType == runtime.SecurityProfile_RuntimeDefault {
                return seccomp.WithDefaultProfile(), nil
        }
        if profileType == runtime.SecurityProfile_Localhost {
                // trimming the localhost/ prefix just in case even though it should not
                // be necessary with the new SecurityProfile struct
                localProfile := strings.TrimPrefix(sp.LocalhostRef, profileNamePrefix)
                return seccomp.WithProfile(localProfile), nil
        }
        return nil, errors.New("seccomp unknown ProfileType")
}

// generateSeccompSecurityProfile builds a security profile from profilePath,
// falling back to unsetProfilePath when profilePath is empty. When both are
// empty it returns a nil profile and a nil error.
func generateSeccompSecurityProfile(profilePath string, unsetProfilePath string) (*runtime.SecurityProfile, error) {
        switch {
        case profilePath != "":
                return generateSecurityProfile(profilePath)
        case unsetProfilePath != "":
                return generateSecurityProfile(unsetProfilePath)
        default:
                return nil, nil
        }
}

// generateSecurityProfile converts a profile path string into a CRI
// SecurityProfile:
//
//   - "", runtimeDefault and dockerDefault map to the RuntimeDefault type;
//   - unconfinedProfile maps to the Unconfined type;
//   - anything else must start with profileNamePrefix and maps to the
//     Localhost type with the prefix stripped from LocalhostRef.
//
// Any other value yields an "invalid profile" error.
func generateSecurityProfile(profilePath string) (*runtime.SecurityProfile, error) {
        if profilePath == "" || profilePath == runtimeDefault || profilePath == dockerDefault {
                return &runtime.SecurityProfile{
                        ProfileType: runtime.SecurityProfile_RuntimeDefault,
                }, nil
        }
        if profilePath == unconfinedProfile {
                return &runtime.SecurityProfile{
                        ProfileType: runtime.SecurityProfile_Unconfined,
                }, nil
        }

        // Require and trim the localhost profile name prefix.
        if !strings.HasPrefix(profilePath, profileNamePrefix) {
                return nil, fmt.Errorf("invalid profile %q", profilePath)
        }
        localhostRef := strings.TrimPrefix(profilePath, profileNamePrefix)
        return &runtime.SecurityProfile{
                ProfileType:  runtime.SecurityProfile_Localhost,
                LocalhostRef: localhostRef,
        }, nil
}

// generateUserString generates valid user string based on OCI Image Spec
// v1.0.0.
//
// CRI defines that the following combinations are valid:
//
// (none) -> ""
// username -> username
// username, uid -> username
// username, uid, gid -> username:gid
// username, gid -> username:gid
// uid -> uid
// uid, gid -> uid:gid
// gid -> error
//
// TODO(random-liu): Add group name support in CRI.
func generateUserString(username string, uid, gid *runtime.Int64Value) (string, error) {
        // A non-empty username takes precedence over the numeric uid.
        user := username
        if user == "" && uid != nil {
                user = strconv.FormatInt(uid.GetValue(), 10)
        }

        var group string
        if gid != nil {
                group = strconv.FormatInt(gid.GetValue(), 10)
        }

        switch {
        case user == "" && group == "":
                return "", nil
        case user == "":
                // A group on its own is not a valid combination.
                return "", fmt.Errorf("user group %q is specified without user", group)
        case group == "":
                return user, nil
        default:
                return user + ":" + group, nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "context"
        "fmt"
        "time"

        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        eventtypes "github.com/containerd/containerd/api/events"
        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/sandbox"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        "github.com/containerd/containerd/v2/internal/cri/constants"
        "github.com/containerd/containerd/v2/internal/cri/server/events"
        "github.com/containerd/containerd/v2/internal/cri/server/podsandbox/types"
        imagestore "github.com/containerd/containerd/v2/internal/cri/store/image"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/containerd/v2/pkg/oci"
        osinterface "github.com/containerd/containerd/v2/pkg/os"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
)

// init registers the "podsandbox" sandbox controller plugin.
//
// The plugin's InitFn creates an in-memory containerd client, looks up the CRI
// "runtime" and "images" service plugins, builds a Controller from them, and
// starts an event monitor subscribed to "/tasks/exit" events. A dependency
// that cannot be loaded, or that does not implement the expected interface,
// is reported as an error rather than a panic.
func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.SandboxControllerPlugin,
                ID:   "podsandbox",
                Requires: []plugin.Type{
                        plugins.EventPlugin,
                        plugins.LeasePlugin,
                        plugins.SandboxStorePlugin,
                        plugins.CRIServicePlugin,
                        plugins.ServicePlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        client, err := containerd.New(
                                "",
                                containerd.WithDefaultNamespace(constants.K8sContainerdNamespace),
                                containerd.WithDefaultPlatform(platforms.Default()),
                                containerd.WithInMemoryServices(ic),
                        )
                        if err != nil {
                                return nil, fmt.Errorf("unable to init client for podsandbox: %w", err)
                        }

                        // Get runtime service.
                        criRuntimePlugin, err := ic.GetByID(plugins.CRIServicePlugin, "runtime")
                        if err != nil {
                                return nil, fmt.Errorf("unable to load CRI runtime service plugin dependency: %w", err)
                        }
                        // Checked assertion: a mismatched plugin instance must fail
                        // initialization with an error instead of panicking.
                        runtimeService, ok := criRuntimePlugin.(RuntimeService)
                        if !ok {
                                return nil, fmt.Errorf("CRI runtime service plugin has unexpected type %T", criRuntimePlugin)
                        }

                        // Get image service.
                        criImagePlugin, err := ic.GetByID(plugins.CRIServicePlugin, "images")
                        if err != nil {
                                return nil, fmt.Errorf("unable to load CRI image service plugin dependency: %w", err)
                        }
                        imageService, ok := criImagePlugin.(ImageService)
                        if !ok {
                                return nil, fmt.Errorf("CRI image service plugin has unexpected type %T", criImagePlugin)
                        }

                        c := Controller{
                                client:         client,
                                config:         runtimeService.Config(),
                                os:             osinterface.RealOS{},
                                runtimeService: runtimeService,
                                imageService:   imageService,
                                store:          NewStore(),
                        }

                        // The handler keeps a pointer to c, so the monitor must be
                        // created after c and stored back into it.
                        eventMonitor := events.NewEventMonitor(&podSandboxEventHandler{
                                controller: &c,
                        })
                        eventMonitor.Subscribe(client, []string{`topic=="/tasks/exit"`})
                        eventMonitor.Start()
                        c.eventMonitor = eventMonitor

                        return &c, nil
                },
        })
}

// RuntimeService specifies dependencies to CRI runtime service.
type RuntimeService interface {
        // Config returns the CRI configuration by value; the controller stores
        // the result as its own config.
        Config() criconfig.Config
        // LoadOCISpec returns the OCI spec identified by the given string; the
        // controller passes a base spec file name.
        LoadOCISpec(string) (*oci.Spec, error)
}

// ImageService specifies dependencies to CRI image service.
//
// NOTE(review): the per-method notes below are inferred from the signatures
// only; the implementation is not in this file — confirm against it.
type ImageService interface {
        // LocalResolve looks up an image from a reference or an ID.
        LocalResolve(refOrID string) (imagestore.Image, error)
        // GetImage looks up an image by ID.
        GetImage(id string) (imagestore.Image, error)
        // PullImage pulls the named image and returns a string result
        // (presumably the image reference or ID — TODO confirm).
        PullImage(ctx context.Context, name string, creds func(string) (string, string, error), sc *runtime.PodSandboxConfig, runtimeHandler string) (string, error)
        // RuntimeSnapshotter returns a snapshotter name for the given runtime.
        RuntimeSnapshotter(ctx context.Context, ociRuntime criconfig.Runtime) string
        // PinnedImage maps a string key to a pinned image name (presumably).
        PinnedImage(string) string
}

// Controller is the podsandbox implementation of sandbox.Controller.
type Controller struct {
        // config contains all configurations.
        config criconfig.Config
        // client is an instance of the containerd client
        client *containerd.Client
        // runtimeService is a dependency to CRI runtime service.
        runtimeService RuntimeService
        // imageService is a dependency to CRI image service.
        imageService ImageService
        // os is an interface for all required os operations.
        os osinterface.OS
        // eventMonitor is the event monitor for podsandbox controller to handle sandbox task exit events.
        // Only its backoff mechanism is actually used, to make sure the pause container is cleaned up.
        eventMonitor *events.EventMonitor

        // store holds the pod sandboxes known to this controller; entries are
        // looked up by sandbox ID.
        store *Store
}

// Compile-time check that *Controller implements sandbox.Controller.
var _ sandbox.Controller = (*Controller)(nil)

// Platform reports the platform of a sandbox. Both arguments are ignored: the
// result is always platforms.DefaultSpec() with a nil error.
func (c *Controller) Platform(_ctx context.Context, _sandboxID string) (platforms.Platform, error) {
        defaultPlatform := platforms.DefaultSpec()
        return defaultPlatform, nil
}

// Wait blocks until the pod sandbox identified by sandboxID exits and returns
// its exit code and exit time.
//
// It returns an error when the sandbox is not present in the controller's
// store or when waiting on the sandbox fails.
func (c *Controller) Wait(ctx context.Context, sandboxID string) (sandbox.ExitStatus, error) {
        var none sandbox.ExitStatus

        podSandbox := c.store.Get(sandboxID)
        if podSandbox == nil {
                return none, fmt.Errorf("failed to get exit channel. %q", sandboxID)
        }

        exit, err := podSandbox.Wait(ctx)
        if err != nil {
                return none, fmt.Errorf("failed to wait pod sandbox, %w", err)
        }

        result := sandbox.ExitStatus{
                ExitStatus: exit.ExitCode(),
                ExitedAt:   exit.ExitTime(),
        }
        return result, nil
}

// waitSandboxExit waits for either an exit status on exitCh or cancellation
// of ctx.
//
// On exit it builds a TaskExit event from the status and runs
// handleSandboxTaskExit. If the exit status cannot be read, unknownExitCode
// and the current time are used instead. If handling fails, the failure is
// logged and the event is handed to the event monitor's Backoff; the function
// still returns nil in that case. On cancellation it returns ctx.Err().
func (c *Controller) waitSandboxExit(ctx context.Context, p *types.PodSandbox, exitCh <-chan containerd.ExitStatus) error {
        select {
        case e := <-exitCh:
                exitStatus, exitedAt, err := e.Result()
                if err != nil {
                        log.G(ctx).WithError(err).Errorf("failed to get task exit status for %q", p.ID)
                        exitStatus = unknownExitCode
                        exitedAt = time.Now()
                }
                // The handling context is built from scratch, not derived from
                // ctx, and is bounded by handleEventTimeout.
                dctx := ctrdutil.NamespacedContext()
                dctx, dcancel := context.WithTimeout(dctx, handleEventTimeout)
                defer dcancel()
                event := &eventtypes.TaskExit{ExitStatus: exitStatus, ExitedAt: protobuf.ToTimestamp(exitedAt)}
                if err := handleSandboxTaskExit(dctx, p, event); err != nil {
                        // Record why the first attempt failed before queuing the
                        // event for backoff; otherwise the cause is lost.
                        log.G(ctx).WithError(err).Errorf("failed to handle sandbox TaskExit event for %q, queuing for backoff", p.ID)
                        c.eventMonitor.Backoff(p.ID, event)
                }
                return nil
        case <-ctx.Done():
                return ctx.Err()
        }
}

// handleSandboxTaskExit handles a TaskExit event for a sandbox.
//
// It loads the sandbox container's task and deletes it (killing the process);
// a "not found" error from either step is tolerated. It then records the exit
// status and exit time on the sandbox through sb.Exit and returns that call's
// error.
func handleSandboxTaskExit(ctx context.Context, sb *types.PodSandbox, e *eventtypes.TaskExit) error {
        // No stream attached to sandbox container.
        task, loadErr := sb.Container.Task(ctx, nil)
        switch {
        case loadErr == nil:
                // TODO(random-liu): [P1] This may block the loop, we may want to spawn a worker
                _, delErr := task.Delete(ctx, WithNRISandboxDelete(sb.ID), containerd.WithProcessKill)
                if delErr != nil && !errdefs.IsNotFound(delErr) {
                        return fmt.Errorf("failed to stop sandbox: %w", delErr)
                }
        case !errdefs.IsNotFound(loadErr):
                return fmt.Errorf("failed to load task for sandbox: %w", loadErr)
        }

        return sb.Exit(e.ExitStatus, protobuf.FromTimestamp(e.ExitedAt))
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "context"
        "fmt"
        "time"

        "github.com/containerd/log"

        eventtypes "github.com/containerd/containerd/api/events"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
)

const (
        // handleEventTimeout is the timeout for handling a single event. The
        // event monitor handles events serially, so one blocked event would
        // keep every other event from being handled.
        // Each event is therefore handled under this timeout; events that
        // time out will be requeued and handled again in the future.
        handleEventTimeout = 10 * time.Second
)

// podSandboxEventHandler handles events on behalf of a podsandbox Controller.
type podSandboxEventHandler struct {
        // controller is the Controller whose store is consulted for sandboxes.
        controller *Controller
}

// HandleEvent processes a single event. Only *eventtypes.TaskExit events are
// acted on; any other event type is ignored and nil is returned.
//
// A TaskExit whose ID does not match a sandbox in the controller's store is
// also ignored. Otherwise the exit is handled under a fresh namespaced
// context limited by handleEventTimeout.
func (p *podSandboxEventHandler) HandleEvent(any interface{}) error {
        exitEvent, isTaskExit := any.(*eventtypes.TaskExit)
        if !isTaskExit {
                return nil
        }
        log.L.Infof("TaskExit event in podsandbox handler %+v", exitEvent)

        // Use ID instead of ContainerID to rule out TaskExit event for exec.
        sb := p.controller.store.Get(exitEvent.ID)
        if sb == nil {
                return nil
        }

        ctx, cancel := context.WithTimeout(ctrdutil.NamespacedContext(), handleEventTimeout)
        defer cancel()

        err := handleSandboxTaskExit(ctx, sb, exitEvent)
        if err != nil {
                return fmt.Errorf("failed to handle container TaskExit event: %w", err)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "context"
        "fmt"
        "path"
        "path/filepath"

        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
        docker "github.com/distribution/reference"
        imagedigest "github.com/opencontainers/go-digest"
        runtimespec "github.com/opencontainers/runtime-spec/specs-go"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/containers"
        crilabels "github.com/containerd/containerd/v2/internal/cri/labels"
        imagestore "github.com/containerd/containerd/v2/internal/cri/store/image"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        clabels "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/containerd/v2/pkg/oci"
)

const (

        // sandboxesDir is the name of the directory that contains every
        // sandbox root. A sandbox root is the running directory of the
        // sandbox; all files created for the sandbox are placed under it.
        sandboxesDir = "sandboxes"
        // MetadataKey is the key used for storing metadata in the sandbox extensions
        MetadataKey = "metadata"
)

const (
        // unknownExitCode is the exit code used when the exit reason is unknown,
        // e.g. when the task's exit status cannot be read.
        unknownExitCode = 255
)

// getSandboxRootDir returns the root directory for managing sandbox files,
// e.g. hosts files. The path is <config.RootDir>/sandboxes/<id>.
func (c *Controller) getSandboxRootDir(id string) string {
        base := c.config.RootDir
        return filepath.Join(base, sandboxesDir, id)
}

// getVolatileSandboxRootDir returns the root directory for managing volatile
// sandbox files, e.g. named pipes. The path is <config.StateDir>/sandboxes/<id>.
func (c *Controller) getVolatileSandboxRootDir(id string) string {
        base := c.config.StateDir
        return filepath.Join(base, sandboxesDir, id)
}

// getRepoDigestAndTag returns the repoDigest and repoTag (in that order) of
// the named image reference.
//
// repoTag is set only when the reference carries a tag. repoDigest is the
// reference itself when it is canonical; otherwise it is built from the
// reference name and digest, except for schema1 images, where it stays empty.
func getRepoDigestAndTag(namedRef docker.Named, digest imagedigest.Digest, schema1 bool) (string, string) {
        var repoDigest, repoTag string

        if _, tagged := namedRef.(docker.NamedTagged); tagged {
                repoTag = namedRef.String()
        }

        _, canonical := namedRef.(docker.Canonical)
        switch {
        case canonical:
                repoDigest = namedRef.String()
        case !schema1:
                // digest is not actual repo digest for schema1 image.
                repoDigest = namedRef.Name() + "@" + digest.String()
        }

        return repoDigest, repoTag
}

// toContainerdImage converts an image object from the image store into a
// containerd image handle by looking up the image's first reference with the
// containerd client. An image without any reference is reported as an error.
func (c *Controller) toContainerdImage(ctx context.Context, image imagestore.Image) (containerd.Image, error) {
        refs := image.References
        // image should always have at least one reference.
        if len(refs) == 0 {
                return nil, fmt.Errorf("invalid image with no reference %q", image.ID)
        }
        return c.client.GetImage(ctx, refs[0])
}

// buildLabels builds the labels to be passed to containerd.
//
// Image config labels are added first; a label that fails validation is
// skipped with a warning. Labels from the CRI request (configLabels) are then
// applied without validation and override image labels with the same key.
// Finally the container kind label is set to containerType.
func buildLabels(configLabels, imageConfigLabels map[string]string, containerType string) map[string]string {
        merged := make(map[string]string)

        for key, value := range imageConfigLabels {
                if err := clabels.Validate(key, value); err != nil {
                        // In case the image label is invalid, we output a warning and skip adding it to the
                        // container.
                        log.L.WithError(err).Warnf("unable to add image label with key %s to the container", key)
                        continue
                }
                merged[key] = value
        }

        // labels from the CRI request (config) will override labels in the image config
        for key, value := range configLabels {
                merged[key] = value
        }

        merged[crilabels.ContainerKindLabel] = containerType
        return merged
}

// parseImageReferences parses a list of arbitrary image references and
// returns the repo tags and repo digests, in that order.
//
// References that fail to parse are skipped. A canonical (digested) reference
// counts as a digest even if it is also tagged.
func parseImageReferences(refs []string) ([]string, []string) {
        var tags, digests []string
        for _, ref := range refs {
                parsed, err := docker.ParseAnyReference(ref)
                if err != nil {
                        continue
                }
                // Case order matters: Canonical is checked before Tagged.
                switch parsed.(type) {
                case docker.Canonical:
                        digests = append(digests, parsed.String())
                case docker.Tagged:
                        tags = append(tags, parsed.String())
                }
        }
        return tags, digests
}

// getPassthroughAnnotations filters requested pod annotations by comparing
// against permitted annotations for the given runtime.
//
// Each entry of runtimePodAnnotations is a path.Match pattern. An annotation
// is kept when its key matches at least one pattern; a malformed pattern never
// matches. The returned map is always non-nil.
func getPassthroughAnnotations(podAnnotations map[string]string,
        runtimePodAnnotations []string) (passthroughAnnotations map[string]string) {
        passthroughAnnotations = make(map[string]string)

        for podAnnotationKey, podAnnotationValue := range podAnnotations {
                for _, pattern := range runtimePodAnnotations {
                        // Use path.Match instead of filepath.Match here.
                        // filepath.Match treated `\\` as path separator
                        // on windows, which is not what we want.
                        // The error (bad pattern) is deliberately ignored: such a
                        // pattern is treated as a non-match.
                        if ok, _ := path.Match(pattern, podAnnotationKey); ok {
                                passthroughAnnotations[podAnnotationKey] = podAnnotationValue
                                // One match is enough; skip the remaining patterns.
                                break
                        }
                }
        }
        return passthroughAnnotations
}

// runtimeSpec returns a default runtime spec used in cri-containerd.
//
// With an empty baseSpecFile the spec is generated from opts alone. Otherwise
// the base spec is loaded through the runtime service, deep-copied so the
// loaded spec is not modified, and opts are applied to the copy after a
// namespaced-cgroup option.
func (c *Controller) runtimeSpec(id string, baseSpecFile string, opts ...oci.SpecOpts) (*runtimespec.Spec, error) {
        // GenerateSpec needs namespace.
        ctx := ctrdutil.NamespacedContext()
        container := &containers.Container{ID: id}

        if baseSpecFile == "" {
                generated, err := oci.GenerateSpec(ctx, nil, container, opts...)
                if err != nil {
                        return nil, fmt.Errorf("failed to generate spec: %w", err)
                }
                return generated, nil
        }

        baseSpec, err := c.runtimeService.LoadOCISpec(baseSpecFile)
        if err != nil {
                return nil, fmt.Errorf("can't load base OCI spec %q: %w", baseSpecFile, err)
        }

        var spec oci.Spec
        if err := ctrdutil.DeepCopy(&spec, &baseSpec); err != nil {
                return nil, fmt.Errorf("failed to clone OCI spec: %w", err)
        }

        // Fix up cgroups path: the namespaced-cgroup option runs before the
        // caller's options.
        applyOpts := make([]oci.SpecOpts, 0, len(opts)+1)
        applyOpts = append(applyOpts, oci.WithNamespacedCgroup())
        applyOpts = append(applyOpts, opts...)

        if err := oci.ApplyOpts(ctx, nil, container, &spec, applyOpts...); err != nil {
                return nil, fmt.Errorf("failed to apply OCI options: %w", err)
        }
        return &spec, nil
}

// getMetadata extracts the sandbox metadata stored in the container's
// crilabels.SandboxMetadataExtension extension. It fails when the extensions
// cannot be read, the extension is missing, it cannot be unmarshalled, or the
// decoded value is not a *sandboxstore.Metadata.
func getMetadata(ctx context.Context, container containerd.Container) (*sandboxstore.Metadata, error) {
        extensions, err := container.Extensions(ctx)
        if err != nil {
                return nil, fmt.Errorf("failed to get sandbox container extensions: %w", err)
        }

        raw, found := extensions[crilabels.SandboxMetadataExtension]
        if !found {
                return nil, fmt.Errorf("metadata extension %q not found", crilabels.SandboxMetadataExtension)
        }

        decoded, err := typeurl.UnmarshalAny(raw)
        if err != nil {
                return nil, fmt.Errorf("failed to unmarshal metadata extension %q: %w", raw, err)
        }

        if meta, isMeta := decoded.(*sandboxstore.Metadata); isMeta {
                return meta, nil
        }
        return nil, fmt.Errorf("failed to convert the extension to sandbox metadata")
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "context"
        "fmt"
        "os"
        "path"
        "path/filepath"
        "regexp"
        "sort"
        "strings"
        "syscall"
        "time"

        "github.com/containerd/log"
        "github.com/moby/sys/mountinfo"
        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        "github.com/opencontainers/selinux/go-selinux/label"
        "golang.org/x/sys/unix"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/internal/cri/seutil"
        "github.com/containerd/containerd/v2/pkg/seccomp"
)

const (
        // defaultSandboxOOMAdj is the default OOM score adjustment for the sandbox
        // container (kubernetes#47938).
        defaultSandboxOOMAdj = -998
        // defaultShmSize is the default size of the sandbox shm, in bytes (64 MiB).
        defaultShmSize = int64(1024 * 1024 * 64)
        // relativeRootfsPath is the rootfs path relative to the bundle path.
        relativeRootfsPath = "rootfs"
        // devShm is the default path of /dev/shm.
        devShm = "/dev/shm"
        // etcHosts is the default path of the /etc/hosts file.
        etcHosts = "/etc/hosts"
        // resolvConfPath is the absolute path of resolv.conf on the host or in the
        // container.
        resolvConfPath = "/etc/resolv.conf"
)

// getCgroupsPath builds the cgroups path for container id under cgroupsParent.
//
// When the last element of cgroupsParent is a systemd slice (it ends in
// ".slice"), the runc systemd form "slice:prefix:name" is produced, e.g.
// a.slice/b.slice/c.slice yields "c.slice:cri-containerd:<id>". Otherwise the
// id is simply joined onto cgroupsParent.
func getCgroupsPath(cgroupsParent, id string) string {
        slice := path.Base(cgroupsParent)
        if !strings.HasSuffix(slice, ".slice") {
                return filepath.Join(cgroupsParent, id)
        }
        return slice + ":cri-containerd:" + id
}

// getSandboxHostname returns the path of the "hostname" file located directly
// under the root directory of sandbox id (as given by getSandboxRootDir).
func (c *Controller) getSandboxHostname(id string) string {
        return filepath.Join(c.getSandboxRootDir(id), "hostname")
}

// getSandboxHosts returns the path of the "hosts" file located directly under
// the root directory of sandbox id (as given by getSandboxRootDir).
func (c *Controller) getSandboxHosts(id string) string {
        return filepath.Join(c.getSandboxRootDir(id), "hosts")
}

// getResolvPath returns the path of the "resolv.conf" file located directly
// under the root directory of sandbox id (as given by getSandboxRootDir).
func (c *Controller) getResolvPath(id string) string {
        return filepath.Join(c.getSandboxRootDir(id), "resolv.conf")
}

// getSandboxDevShm returns the path of the "shm" entry for sandbox id. Unlike
// the hostname/hosts/resolv.conf helpers, it is rooted at the volatile sandbox
// root directory (getVolatileSandboxRootDir), not the regular sandbox root.
func (c *Controller) getSandboxDevShm(id string) string {
        return filepath.Join(c.getVolatileSandboxRootDir(id), "shm")
}

// toLabel converts CRI SELinux options into a list of "key:value" label
// strings, in the fixed order user, role, type, level. Empty fields are
// skipped. A nil option yields a nil slice and no error; an ill-formed level
// is rejected by checkSelinuxLevel before any label is built.
func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
        if selinuxOptions == nil {
                return nil, nil
        }
        if err := checkSelinuxLevel(selinuxOptions.Level); err != nil {
                return nil, err
        }

        components := [...]struct{ prefix, value string }{
                {"user:", selinuxOptions.User},
                {"role:", selinuxOptions.Role},
                {"type:", selinuxOptions.Type},
                {"level:", selinuxOptions.Level},
        }

        var labels []string
        for _, comp := range components {
                if comp.value != "" {
                        labels = append(labels, comp.prefix+comp.value)
                }
        }
        return labels, nil
}

// initLabelsFromOpt converts selinuxOpts to label strings with toLabel and
// passes them to label.InitLabels, returning its two label strings and error
// unchanged. If the conversion fails, both strings are empty and the
// conversion error is returned.
func initLabelsFromOpt(selinuxOpts *runtime.SELinuxOption) (string, string, error) {
        labels, err := toLabel(selinuxOpts)
        if err != nil {
                return "", "", err
        }
        return label.InitLabels(labels)
}

// selinuxLevelRegexp matches an SELinux level: a sensitivity ("s" plus one
// digit), an optional "-s<digit>" upper bound, and an optional ":"-introduced,
// comma separated list of categories ("c" plus 1-4 digits), each of which may
// be a "cN.cM" range. It is compiled once at package initialization instead
// of on every validation call.
var selinuxLevelRegexp = regexp.MustCompile(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`)

// checkSelinuxLevel validates the format of an SELinux level string.
// An empty level is accepted; any other value must match selinuxLevelRegexp,
// otherwise an error quoting the offending level is returned.
func checkSelinuxLevel(level string) error {
        if level == "" {
                return nil
        }
        if !selinuxLevelRegexp.MatchString(level) {
                return fmt.Errorf("the format of 'level' %q is not correct", level)
        }
        return nil
}

// seccompEnabled reports the result of seccomp.IsEnabled. It does not read
// any Controller state.
func (c *Controller) seccompEnabled() bool {
        return seccomp.IsEnabled()
}

// unmountRecursive detaches target together with every mount found beneath
// it. Mounts are processed longest mountpoint first, so nested mounts go
// before their parents. A failure on any mount other than the last one
// processed is only logged at debug level; a failure on the last one is
// returned.
func unmountRecursive(ctx context.Context, target string) error {
        canonical, err := mount.CanonicalizePath(target)
        if err != nil {
                return err
        }

        mounts, err := mountinfo.GetMounts(mountinfo.PrefixFilter(canonical))
        if err != nil {
                return err
        }

        // Longest mountpoint first, i.e. the deepest mount is handled first.
        sort.Slice(mounts, func(a, b int) bool {
                return len(mounts[a].Mountpoint) > len(mounts[b].Mountpoint)
        })

        last := len(mounts) - 1
        for idx, info := range mounts {
                unmountErr := mount.UnmountAll(info.Mountpoint, unix.MNT_DETACH)
                if unmountErr == nil {
                        continue
                }
                if idx == last {
                        return unmountErr
                }
                // This is some submount, we can ignore this error for now, the final unmount will fail if this is a real problem
                log.G(ctx).WithError(unmountErr).Debugf("failed to unmount submount %s", info.Mountpoint)
        }
        return nil
}

// ensureRemoveAll wraps `os.RemoveAll` to check for specific errors that can
// often be remedied.
// Only use `ensureRemoveAll` if you really want to make every effort to remove
// a directory.
//
// Because of the way `os.Remove` (and by extension `os.RemoveAll`) works, there
// can be a race between reading directory entries and then actually attempting
// to remove everything in the directory.
// These types of errors do not need to be returned since it's ok for the dir to
// be gone; we can just retry the remove operation.
//
// Before the first removal attempt, everything mounted beneath dir is
// unmounted on a best-effort basis. Afterwards the removal is retried when:
//   - a path below dir is reported as not existing (once per path), or
//   - a path is busy (EBUSY), in which case it is lazily unmounted and the
//     removal is retried after 100ms, up to maxRetry times per path.
//
// A not-exist error for dir itself is treated as success.
// NOTE(review): a not-exist error reported twice for the same path below dir
// is returned to the caller as-is.
func ensureRemoveAll(ctx context.Context, dir string) error {
        // Paths for which a not-exist error has already been seen once.
        notExistErr := make(map[string]bool)

        // track retries
        exitOnErr := make(map[string]int)
        maxRetry := 50

        // Attempt to unmount anything beneath this dir first.
        // A failure here is only logged; the removal below is still attempted.
        if err := unmountRecursive(ctx, dir); err != nil {
                log.G(ctx).WithError(err).Debugf("failed to do initial unmount of %s", dir)
        }

        for {
                err := os.RemoveAll(dir)
                if err == nil {
                        return nil
                }

                // Only path errors carry the offending path needed for the
                // recovery steps below; anything else is returned unchanged.
                pe, ok := err.(*os.PathError)
                if !ok {
                        return err
                }

                if os.IsNotExist(err) {
                        // Seeing the same missing path a second time means retrying
                        // does not help, so give up.
                        if notExistErr[pe.Path] {
                                return err
                        }
                        notExistErr[pe.Path] = true

                        // There is a race where some subdir can be removed but after the
                        // parent dir entries have been read.
                        // So the path could be from `os.Remove(subdir)`
                        // If the reported non-existent path is not the passed in `dir` we
                        // should just retry, but otherwise return with no error.
                        if pe.Path == dir {
                                return nil
                        }
                        continue
                }

                // Besides not-exist, only EBUSY is handled here.
                if pe.Err != syscall.EBUSY {
                        return err
                }
                // The busy path is detached with MNT_DETACH before retrying.
                if e := mount.Unmount(pe.Path, unix.MNT_DETACH); e != nil {
                        return fmt.Errorf("error while removing %s: %w", dir, e)
                }

                // Give up on a path once it has been retried maxRetry times.
                if exitOnErr[pe.Path] == maxRetry {
                        return err
                }
                exitOnErr[pe.Path]++
                time.Sleep(100 * time.Millisecond)
        }
}

// vmbasedRuntimes lists the substrings that mark a runtime type as VM-based.
var vmbasedRuntimes = []string{
        "io.containerd.kata",
}

// isVMBasedRuntime reports whether runtimeType contains any of the entries in
// vmbasedRuntimes as a substring.
func isVMBasedRuntime(runtimeType string) bool {
        for idx := range vmbasedRuntimes {
                if marker := vmbasedRuntimes[idx]; strings.Contains(runtimeType, marker) {
                        return true
                }
        }
        return false
}

// modifyProcessLabel replaces the process SELinux label in spec with the
// result of seutil.ChangeToKVM when runtimeType is a VM-based runtime. For any
// other runtime type the spec is left untouched.
func modifyProcessLabel(runtimeType string, spec *runtimespec.Spec) error {
        if isVMBasedRuntime(runtimeType) {
                kvmLabel, err := seutil.ChangeToKVM(spec.Process.SelinuxLabel)
                if err != nil {
                        return fmt.Errorf("failed to get selinux kvm label: %w", err)
                }
                spec.Process.SelinuxLabel = kvmLabel
        }
        return nil
}

// parseUsernsIDMap converts a CRI ID mapping list into runtime-spec ID
// mappings. An empty list, or a single nil entry, yields a nil slice. More
// than one entry, or an entry whose length is below 1, is an error.
func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]runtimespec.LinuxIDMapping, error) {
        switch lines := len(runtimeIDMap); {
        case lines == 0:
                return nil, nil
        case lines > 1:
                // We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that.
                return nil, fmt.Errorf("only one mapping line supported, got %v mapping lines", lines)
        }

        // Exactly one entry remains at this point.
        line := runtimeIDMap[0]
        if line == nil {
                return nil, nil
        }
        if line.Length < 1 {
                return nil, fmt.Errorf("invalid mapping length: %v", line.Length)
        }

        return []runtimespec.LinuxIDMapping{
                {
                        ContainerID: line.ContainerId,
                        HostID:      line.HostId,
                        Size:        line.Length,
                },
        }, nil
}

// parseUsernsIDs parses the UID and GID mappings of userns and checks them
// against the requested namespace mode:
//   - NODE mode must not carry any mappings;
//   - POD mode requires both UID and GID mappings;
//   - any other mode is rejected.
//
// A nil userns yields no mappings and no error.
func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []runtimespec.LinuxIDMapping, retErr error) {
        if userns == nil {
                // If userns is not set, the kubelet doesn't support this option
                // and we should just fallback to no userns. This is completely
                // valid.
                return nil, nil, nil
        }

        var err error
        if uids, err = parseUsernsIDMap(userns.GetUids()); err != nil {
                return nil, nil, fmt.Errorf("UID mapping: %w", err)
        }
        if gids, err = parseUsernsIDMap(userns.GetGids()); err != nil {
                return nil, nil, fmt.Errorf("GID mapping: %w", err)
        }

        mode := userns.GetMode()
        isNode := mode == runtime.NamespaceMode_NODE
        isPod := mode == runtime.NamespaceMode_POD
        switch {
        case isNode && (len(uids) != 0 || len(gids) != 0):
                return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids))
        case isPod && (len(uids) == 0 || len(gids) == 0):
                // POD mode with mappings is valid, we will handle it in WithPodNamespaces().
                return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode)
        case !isNode && !isPod:
                return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
        }

        return uids, gids, nil
}

// snapshotterRemapOpts derives snapshotter options from the user namespace
// settings in nsOpts. Without user namespace options, or in any mode other
// than POD, the returned list is empty. In POD mode a single
// containerd.WithRemapperLabels option is returned, built from the host IDs of
// the first UID and GID mappings and the size of the first UID mapping.
func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) {
        remapOpts := []snapshots.Opt{}

        userns := nsOpts.GetUsernsOptions()
        if userns == nil {
                return remapOpts, nil
        }

        uidMaps, gidMaps, err := parseUsernsIDs(userns)
        if err != nil {
                return nil, fmt.Errorf("user namespace configuration: %w", err)
        }

        if userns.GetMode() != runtime.NamespaceMode_POD {
                return remapOpts, nil
        }

        // parseUsernsIDs guarantees both mappings are non-empty in POD mode.
        uid, gid := uidMaps[0], gidMaps[0]
        return append(remapOpts, containerd.WithRemapperLabels(0, uid.HostID, 0, gid.HostID, uid.Size)), nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "context"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/log"
        "github.com/containerd/nri"
        v1 "github.com/containerd/nri/types/v1"
)

// WithNRISandboxDelete returns a process delete option that invokes the NRI
// delete hook for the sandbox identified by sandboxID. The option is a no-op
// for processes that are not tasks and when no NRI client is available. NRI
// failures are logged and never returned, so they cannot block the delete.
func WithNRISandboxDelete(sandboxID string) containerd.ProcessDeleteOpts {
        return func(ctx context.Context, p containerd.Process) error {
                t, isTask := p.(containerd.Task)
                if !isTask {
                        return nil
                }

                client, err := nri.New()
                switch {
                case err != nil:
                        log.G(ctx).WithError(err).Error("unable to create nri client")
                        return nil
                case client == nil:
                        return nil
                }

                _, err = client.InvokeWithSandbox(ctx, t, v1.Delete, &nri.Sandbox{ID: sandboxID})
                if err != nil {
                        log.G(ctx).WithError(err).Errorf("Failed to delete nri for %q", t.ID())
                }
                return nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "context"
        "fmt"
        goruntime "runtime"
        "time"

        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        sandbox2 "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/internal/cri/config"
        "github.com/containerd/containerd/v2/internal/cri/server/podsandbox/types"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/containerd/v2/pkg/netns"
)

// loadContainerTimeout is the default timeout for loading a container/sandbox.
// A single hung container/sandbox (e.g. containerd#2438) should not affect
// other containers/sandboxes.
// Most CRI container/sandbox related operations are per container; the ones
// which handle multiple containers at a time are:
// * ListPodSandboxes: Doesn't talk to containerd services.
// * ListContainers: Doesn't talk to containerd services.
// * ListContainerStats: Not in the critical code path; a default timeout will
// be applied at the CRI level.
// * Recovery logic: We should set a timeout for each container/sandbox recovery.
// * Event monitor: We should set a timeout for each container/sandbox event handling.
const loadContainerTimeout = 10 * time.Second

// RecoverContainer rebuilds the in-memory state of the pod sandbox backed by
// cntr. All containerd calls made with ctx are bounded by loadContainerTimeout.
//
// It loads the sandbox metadata from the container extensions, derives the
// sandbox status from the container's task, stores a types.PodSandbox in
// c.store and returns the corresponding sandboxstore.Sandbox.
//
// A failure to read the metadata or the container info, or to save to the
// store, is returned to the caller. A failure while determining the task
// status is only logged; recovery continues with the status computed so far.
func (c *Controller) RecoverContainer(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) {
        ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
        defer cancel()
        var sandbox sandboxstore.Sandbox
        meta, err := getMetadata(ctx, cntr)
        if err != nil {
                return sandbox, err
        }

        // Load sandbox created timestamp.
        info, err := cntr.Info(ctx)
        if err != nil {
                return sandbox, fmt.Errorf("failed to get sandbox container info: %w", err)
        }

        // Compute the sandbox status and, for a running task, the channel that
        // delivers its exit status.
        s, ch, err := func() (sandboxstore.Status, <-chan containerd.ExitStatus, error) {
                // The state stays Unknown if an error interrupts the checks below.
                status := sandboxstore.Status{
                        State: sandboxstore.StateUnknown,
                }
                var channel <-chan containerd.ExitStatus

                status.CreatedAt = info.CreatedAt

                // Load sandbox state.
                t, err := cntr.Task(ctx, nil)
                if err != nil && !errdefs.IsNotFound(err) {
                        return status, channel, fmt.Errorf("failed to load task: %w", err)
                }
                var taskStatus containerd.Status
                var notFound bool
                if errdefs.IsNotFound(err) {
                        // Task is not found.
                        notFound = true
                } else {
                        // Task is found. Get task status.
                        taskStatus, err = t.Status(ctx)
                        if err != nil {
                                // It's still possible that task is deleted during this window.
                                if !errdefs.IsNotFound(err) {
                                        return status, channel, fmt.Errorf("failed to get task status: %w", err)
                                }
                                notFound = true
                        }
                }
                if notFound {
                        // Task does not exist, set sandbox state as NOTREADY.
                        status.State = sandboxstore.StateNotReady
                } else {
                        if taskStatus.Status == containerd.Running {
                                // Wait is issued with a fresh namespaced context rather than the
                                // timeout-bound ctx.
                                // NOTE(review): presumably so the exit channel outlives this call — confirm.
                                exitCh, err := t.Wait(ctrdutil.NamespacedContext())
                                if err != nil {
                                        if !errdefs.IsNotFound(err) {
                                                return status, channel, fmt.Errorf("failed to wait for sandbox container task: %w", err)
                                        }
                                        // The task disappeared before we could wait on it.
                                        status.State = sandboxstore.StateNotReady
                                } else {
                                        status.State = sandboxstore.StateReady
                                        status.Pid = t.Pid()
                                        channel = exitCh
                                }
                        } else {
                                // Task is not running. Delete the task and set sandbox state as NOTREADY.
                                if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
                                        return status, channel, fmt.Errorf("failed to delete task: %w", err)
                                }
                                status.State = sandboxstore.StateNotReady
                        }
                }
                return status, channel, nil
        }()
        if err != nil {
                // Not fatal: the sandbox is still recovered with the status returned above.
                log.G(ctx).WithError(err).Errorf("Failed to load sandbox status for %q", cntr.ID())
        }

        // save it to cache in the podsandbox controller
        podSandbox := types.NewPodSandbox(cntr.ID(), s)
        podSandbox.Container = cntr
        if meta != nil {
                podSandbox.Metadata = *meta
        }
        podSandbox.Runtime = sandbox2.RuntimeOpts{
                Name:    info.Runtime.Name,
                Options: info.Runtime.Options,
        }
        // ch is only non-nil when the task was running and Wait succeeded.
        if ch != nil {
                go func() {
                        if err := c.waitSandboxExit(ctrdutil.NamespacedContext(), podSandbox, ch); err != nil {
                                log.G(context.Background()).Warnf("failed to wait pod sandbox exit %v", err)
                        }
                }()
        }

        if err := c.store.Save(podSandbox); err != nil {
                return sandbox, fmt.Errorf("failed to save pod sandbox container in mem store: %w", err)
        }

        sandbox = sandboxstore.NewSandbox(*meta, s)
        sandbox.Container = cntr
        sandbox.Sandboxer = string(config.ModePodSandbox)

        // Load network namespace.
        sandbox.NetNS = getNetNS(meta)

        // It doesn't matter whether task is running or not. If it is running, sandbox
        // status will be `READY`; if it is not running, sandbox status will be `NOT_READY`,
        // kubelet will stop the sandbox which will properly cleanup everything.
        return sandbox, nil
}

// getNetNS loads the network namespace recorded in meta.NetNSPath. For a host
// network sandbox there is nothing to load and nil is returned.
func getNetNS(meta *sandboxstore.Metadata) *netns.NetNS {
        if !hostNetwork(meta.Config) {
                return netns.LoadNetNS(meta.NetNSPath)
        }
        // Don't need to load netns for host network sandbox.
        return nil
}

// hostNetwork reports whether the sandbox described by config uses the host
// network. The answer depends on the OS the binary runs on.
// TODO: Copy pasted from sbserver to handle container sandbox events in podsandbox/ package, needs refactoring.
func hostNetwork(config *runtime.PodSandboxConfig) bool {
        switch goruntime.GOOS {
        case "windows":
                // Windows HostProcess pods can only run on the host network
                return config.GetWindows().GetSecurityContext().GetHostProcess()
        case "darwin":
                // No CNI on Darwin yet.
                return true
        }

        // Linux and every other platform without special quirks: host networking
        // means the network namespace mode is NODE.
        networkMode := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetNetwork()
        return networkMode == runtime.NamespaceMode_NODE
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "context"
        "fmt"

        apitasks "github.com/containerd/containerd/api/services/tasks/v1"
        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
)

// Shutdown tears down the pod sandbox identified by sandboxID: it removes the
// sandbox root and volatile root directories, cleans up the sandbox task,
// deletes the sandbox container together with its snapshot, and finally drops
// the sandbox from the store. An unknown sandboxID is not an error.
func (c *Controller) Shutdown(ctx context.Context, sandboxID string) error {
        sb := c.store.Get(sandboxID)
        if sb == nil {
                // Do not return error if the id doesn't exist.
                log.G(ctx).Tracef("Sandbox controller Delete called for sandbox %q that does not exist", sandboxID)
                return nil
        }

        // Cleanup the sandbox root directories.
        rootDir := c.getSandboxRootDir(sandboxID)
        if err := ensureRemoveAll(ctx, rootDir); err != nil {
                return fmt.Errorf("failed to remove sandbox root directory %q: %w", rootDir, err)
        }
        volatileDir := c.getVolatileSandboxRootDir(sandboxID)
        if err := ensureRemoveAll(ctx, volatileDir); err != nil {
                return fmt.Errorf("failed to remove volatile sandbox root directory %q: %w",
                        volatileDir, err)
        }

        // Delete sandbox container.
        if ctr := sb.Container; ctr != nil {
                if err := c.cleanupSandboxTask(ctx, ctr); err != nil {
                        return fmt.Errorf("failed to delete sandbox task %q: %w", sandboxID, err)
                }

                err := ctr.Delete(ctx, containerd.WithSnapshotCleanup)
                if err != nil && !errdefs.IsNotFound(err) {
                        return fmt.Errorf("failed to delete sandbox container %q: %w", sandboxID, err)
                }
                if err != nil {
                        // The container is already gone, which is fine.
                        log.G(ctx).Tracef("Sandbox controller Delete called for sandbox container %q that does not exist", sandboxID)
                }
        }

        c.store.Remove(sandboxID)

        return nil
}

// cleanupSandboxTask deletes the task of the sandbox container sbCntr, killing
// its process if needed. When loading or deleting the task reports NotFound,
// it additionally calls Delete on the task service directly so that a shim
// which is still running gets shut down (see the NOTE below). NotFound from
// any of these calls is not treated as an error.
func (c *Controller) cleanupSandboxTask(ctx context.Context, sbCntr containerd.Container) error {
        task, err := sbCntr.Task(ctx, nil)
        if err != nil {
                if !errdefs.IsNotFound(err) {
                        return fmt.Errorf("failed to load task for sandbox: %w", err)
                }
        } else {
                // err is deliberately reassigned (not shadowed) here so the NotFound
                // check further down also sees the result of task.Delete.
                if _, err = task.Delete(ctx, containerd.WithProcessKill); err != nil {
                        if !errdefs.IsNotFound(err) {
                                return fmt.Errorf("failed to stop sandbox: %w", err)
                        }
                }
        }

        // NOTE: Both the sb.Container.Task and task.Delete interfaces always
        // check the status of the target task. However, when these interfaces
        // return ErrNotFound, it doesn't mean that the shim instance doesn't exist.
        //
        // There are two caches for task in containerd:
        //
        //   1. io.containerd.service.v1.tasks-service
        //   2. io.containerd.runtime.v2.task
        //
        // The first one maintains the shim connection and shuts down the shim
        // in the Delete API. The second one maintains the lifecycle of the
        // task in the shim server.
        //
        // So, if the shim instance is running and the task has been deleted in the
        // shim server, sb.Container.Task and task.Delete will receive
        // ErrNotFound. If we don't delete the shim instance in io.containerd.service.v1.tasks-service,
        // the shim will leak.
        //
        // Based on the containerd/containerd#7496 issue, when the host is under IO
        // pressure, the umount2 syscall can take more than 10 seconds, so
        // the CRI plugin will cancel this task.Delete call. However, the shim
        // server isn't aware of this. After returning from the umount2 syscall, the
        // shim server continues to delete the task record. The CRI plugin then
        // retries deleting the task, receives ErrNotFound and marks it as
        // stopped. Therefore, the shim leaks.
        //
        // It's hard to handle the connection lost or request canceled cases in the
        // shim server. We should call the Delete API of io.containerd.service.v1.tasks-service
        // to ensure that the shim instance is shut down.
        //
        // REF:
        // 1. https://github.com/containerd/containerd/issues/7496#issuecomment-1671100968
        // 2. https://github.com/containerd/containerd/issues/8931
        if errdefs.IsNotFound(err) {
                _, err = c.client.TaskService().Delete(ctx, &apitasks.DeleteTaskRequest{ContainerID: sbCntr.ID()})
                if err != nil {
                        // Convert the gRPC error so NotFound can be recognised.
                        err = errdefs.FromGRPC(err)
                        if !errdefs.IsNotFound(err) {
                                return fmt.Errorf("failed to cleanup sandbox %s in task-service: %w", sbCntr.ID(), err)
                        }
                }
                log.G(ctx).Infof("Ensure that sandbox %s in task-service has been cleanup successfully", sbCntr.ID())
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "context"
        "errors"
        "fmt"

        "github.com/containerd/log"
        "github.com/containerd/nri"
        v1 "github.com/containerd/nri/types/v1"
        "github.com/containerd/typeurl/v2"
        "github.com/davecgh/go-spew/spew"
        "github.com/opencontainers/selinux/go-selinux"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/core/snapshots"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        crilabels "github.com/containerd/containerd/v2/internal/cri/labels"
        customopts "github.com/containerd/containerd/v2/internal/cri/opts"
        "github.com/containerd/containerd/v2/internal/cri/server/podsandbox/types"
        imagestore "github.com/containerd/containerd/v2/internal/cri/store/image"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        containerdio "github.com/containerd/containerd/v2/pkg/cio"
        "github.com/containerd/errdefs"
)

// init registers sandboxstore.Metadata with typeurl under the
// "github.com/containerd/cri/pkg/store/sandbox" / "Metadata" path segments.
// NOTE(review): the path names the older containerd/cri repository, presumably
// kept so previously stored metadata still resolves — confirm before changing.
func init() {
        typeurl.Register(&sandboxstore.Metadata{},
                "github.com/containerd/cri/pkg/store/sandbox", "Metadata")
}

// CleanupErr wraps an error that occurred while tearing down sandbox
// resources. Start joins it onto the original failure when that teardown
// also fails. The wrapped error is embedded, so its Error method is promoted.
type CleanupErr struct {
        error
}

// Start creates resources required for the sandbox and starts the sandbox.
//
// It looks up the pod sandbox by id in the controller store, ensures the
// sandbox image is available, creates the containerd container, the sandbox
// root directories and sandbox files, creates and starts the sandbox task,
// marks the sandbox as ready and finally spawns a goroutine that waits for
// the sandbox to exit.
//
// If an error occurs, Start attempts to tear down the created resources via
// deferred cleanup steps. Those steps run in reverse order of registration
// and each one is skipped once an earlier cleanup step has failed. If a
// cleanup step fails, the returned error is the original error joined with a
// CleanupErr wrapping the cleanup failure. On any error the returned
// ControllerInstance is zero-valued.
// TODO(samuelkarp) Determine whether this error indication is reasonable to retain once controller.Delete is implemented.
func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.ControllerInstance, retErr error) {
        // cleanupErr records the first failure hit by any deferred cleanup step.
        var cleanupErr error
        // Registered first, so it runs last: fold a cleanup failure into retErr.
        defer func() {
                if retErr != nil && cleanupErr != nil {
                        log.G(ctx).WithField("id", id).WithError(cleanupErr).Errorf("failed to fully teardown sandbox resources after earlier error: %s", retErr)
                        retErr = errors.Join(retErr, CleanupErr{cleanupErr})
                }
        }()
        podSandbox := c.store.Get(id)
        if podSandbox == nil {
                return cin, fmt.Errorf("unable to find pod sandbox with id %q: %w", id, errdefs.ErrNotFound)
        }
        metadata := podSandbox.Metadata

        var (
                config = metadata.Config
                // labels is returned to the caller through cin.Labels.
                labels = map[string]string{}
        )

        sandboxImage := c.getSandboxImageName()
        // Ensure sandbox container image snapshot.
        image, err := c.ensureImageExists(ctx, sandboxImage, config, metadata.RuntimeHandler)
        if err != nil {
                return cin, fmt.Errorf("failed to get sandbox image %q: %w", sandboxImage, err)
        }

        containerdImage, err := c.toContainerdImage(ctx, *image)
        if err != nil {
                return cin, fmt.Errorf("failed to get image from containerd %q: %w", image.ID, err)
        }

        ociRuntime, err := c.config.GetSandboxRuntime(config, metadata.RuntimeHandler)
        if err != nil {
                return cin, fmt.Errorf("failed to get sandbox runtime: %w", err)
        }
        log.G(ctx).WithField("podsandboxid", id).Debugf("use OCI runtime %+v", ociRuntime)

        labels["oci_runtime_type"] = ociRuntime.Type

        // Create sandbox container.
        // NOTE: sandboxContainerSpec SHOULD NOT have side
        // effect, e.g. accessing/creating files, so that we can test
        // it safely.
        spec, err := c.sandboxContainerSpec(id, config, &image.ImageSpec.Config, metadata.NetNSPath, ociRuntime.PodAnnotations)
        if err != nil {
                return cin, fmt.Errorf("failed to generate sandbox container spec: %w", err)
        }
        log.G(ctx).WithField("podsandboxid", id).Debugf("sandbox container spec: %#+v", spew.NewFormatter(spec))

        metadata.ProcessLabel = spec.Process.SelinuxLabel
        // Release the SELinux process label again if Start fails.
        defer func() {
                if retErr != nil {
                        selinux.ReleaseLabel(metadata.ProcessLabel)
                }
        }()
        labels["selinux_label"] = metadata.ProcessLabel

        // handle any KVM based runtime
        if err := modifyProcessLabel(ociRuntime.Type, spec); err != nil {
                return cin, err
        }

        if config.GetLinux().GetSecurityContext().GetPrivileged() {
                // If privileged don't set selinux label, but we still record the MCS label so that
                // the unused label can be freed later.
                spec.Process.SelinuxLabel = ""
        }

        // Generate spec options that will be applied to the spec later.
        specOpts, err := c.sandboxContainerSpecOpts(config, &image.ImageSpec.Config)
        if err != nil {
                return cin, fmt.Errorf("failed to generate sandbox container spec options: %w", err)
        }

        sandboxLabels := buildLabels(config.Labels, image.ImageSpec.Config.Labels, crilabels.ContainerKindSandbox)

        snapshotterOpt := []snapshots.Opt{snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations))}
        extraSOpts, err := sandboxSnapshotterOpts(config)
        if err != nil {
                return cin, err
        }
        snapshotterOpt = append(snapshotterOpt, extraSOpts...)

        opts := []containerd.NewContainerOpts{
                containerd.WithSnapshotter(c.imageService.RuntimeSnapshotter(ctx, ociRuntime)),
                customopts.WithNewSnapshot(id, containerdImage, snapshotterOpt...),
                containerd.WithSpec(spec, specOpts...),
                containerd.WithContainerLabels(sandboxLabels),
                containerd.WithContainerExtension(crilabels.SandboxMetadataExtension, &metadata),
                containerd.WithRuntime(ociRuntime.Type, podSandbox.Runtime.Options),
        }

        container, err := c.client.NewContainer(ctx, id, opts...)
        if err != nil {
                return cin, fmt.Errorf("failed to create containerd container: %w", err)
        }
        podSandbox.Container = container
        // On failure, delete the container (with its snapshot) and detach it
        // from the pod sandbox. A fresh context is used for the deletion.
        defer func() {
                if retErr != nil && cleanupErr == nil {
                        deferCtx, deferCancel := ctrdutil.DeferContext()
                        defer deferCancel()
                        if cleanupErr = container.Delete(deferCtx, containerd.WithSnapshotCleanup); cleanupErr != nil {
                                log.G(ctx).WithError(cleanupErr).Errorf("Failed to delete containerd container %q", id)
                        }
                        podSandbox.Container = nil
                }
        }()

        // Create sandbox container root directories.
        sandboxRootDir := c.getSandboxRootDir(id)
        if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil {
                return cin, fmt.Errorf("failed to create sandbox root directory %q: %w",
                        sandboxRootDir, err)
        }
        defer func() {
                if retErr != nil && cleanupErr == nil {
                        // Cleanup the sandbox root directory.
                        if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil {
                                log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q",
                                        sandboxRootDir)
                        }
                }
        }()

        volatileSandboxRootDir := c.getVolatileSandboxRootDir(id)
        if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil {
                return cin, fmt.Errorf("failed to create volatile sandbox root directory %q: %w",
                        volatileSandboxRootDir, err)
        }
        defer func() {
                if retErr != nil && cleanupErr == nil {
                        // Cleanup the volatile sandbox root directory.
                        if cleanupErr = c.os.RemoveAll(volatileSandboxRootDir); cleanupErr != nil {
                                log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q",
                                        volatileSandboxRootDir)
                        }
                }
        }()

        // Setup files required for the sandbox.
        if err = c.setupSandboxFiles(id, config); err != nil {
                return cin, fmt.Errorf("failed to setup sandbox files: %w", err)
        }
        defer func() {
                if retErr != nil && cleanupErr == nil {
                        if cleanupErr = c.cleanupSandboxFiles(id, config); cleanupErr != nil {
                                log.G(ctx).WithError(cleanupErr).Errorf("Failed to cleanup sandbox files in %q",
                                        sandboxRootDir)
                        }
                }
        }()

        // Fetch the container info; its CreatedAt is used as the sandbox
        // created timestamp below.
        info, err := container.Info(ctx)
        if err != nil {
                return cin, fmt.Errorf("failed to get sandbox container info: %w", err)
        }

        // Create sandbox task in containerd.
        log.G(ctx).Tracef("Create sandbox container (id=%q, name=%q).", id, metadata.Name)

        var taskOpts []containerd.NewTaskOpts
        if ociRuntime.Path != "" {
                taskOpts = append(taskOpts, containerd.WithRuntimePath(ociRuntime.Path))
        }

        // We don't need stdio for sandbox container.
        task, err := container.NewTask(ctx, containerdio.NullIO, taskOpts...)
        if err != nil {
                return cin, fmt.Errorf("failed to create containerd task: %w", err)
        }
        // Registered last, so it is the first cleanup step to run on failure.
        defer func() {
                if retErr != nil && cleanupErr == nil {
                        deferCtx, deferCancel := ctrdutil.DeferContext()
                        defer deferCancel()
                        // Cleanup the sandbox container if an error is returned.
                        // A not-found error is not treated as a cleanup failure.
                        if _, err := task.Delete(deferCtx, WithNRISandboxDelete(id), containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
                                log.G(ctx).WithError(err).Errorf("Failed to delete sandbox container %q", id)
                                cleanupErr = err
                        }
                }
        }()

        // wait is a long running background request, no timeout needed.
        exitCh, err := task.Wait(ctrdutil.NamespacedContext())
        if err != nil {
                return cin, fmt.Errorf("failed to wait for sandbox container task: %w", err)
        }

        nric, err := nri.New()
        if err != nil {
                return cin, fmt.Errorf("unable to create nri client: %w", err)
        }
        // nri.New may return a nil client without an error; NRI is skipped then.
        if nric != nil {
                nriSB := &nri.Sandbox{
                        ID:     id,
                        Labels: config.Labels,
                }
                if _, err := nric.InvokeWithSandbox(ctx, task, v1.Create, nriSB); err != nil {
                        return cin, fmt.Errorf("nri invoke: %w", err)
                }
        }

        if err := task.Start(ctx); err != nil {
                return cin, fmt.Errorf("failed to start sandbox container task %q: %w", id, err)
        }
        pid := task.Pid()
        // Record the task pid, the ready state and the creation time in the
        // sandbox status.
        if err := podSandbox.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
                status.Pid = pid
                status.State = sandboxstore.StateReady
                status.CreatedAt = info.CreatedAt
                return status, nil
        }); err != nil {
                return cin, fmt.Errorf("failed to update status of pod sandbox %q: %w", id, err)
        }

        cin.SandboxID = id
        cin.Pid = task.Pid()
        cin.CreatedAt = info.CreatedAt
        cin.Labels = labels

        // Watch for the sandbox exit in the background; failures are only logged.
        go func() {
                if err := c.waitSandboxExit(ctrdutil.NamespacedContext(), podSandbox, exitCh); err != nil {
                        log.G(context.Background()).Warnf("failed to wait pod sandbox exit %v", err)
                }
        }()

        return
}

// Create decodes the sandbox store metadata carried as an extension on info,
// builds a pod sandbox in the unknown state from it, and saves that sandbox
// in the controller store. The context and create options are not used.
func (c *Controller) Create(_ctx context.Context, info sandbox.Sandbox, opts ...sandbox.CreateOpt) error {
        var meta sandboxstore.Metadata
        if err := info.GetExtension(MetadataKey, &meta); err != nil {
                return fmt.Errorf("failed to get sandbox %q metadata: %w", info.ID, err)
        }

        initialStatus := sandboxstore.Status{State: sandboxstore.StateUnknown}
        sb := types.NewPodSandbox(info.ID, initialStatus)
        sb.Metadata, sb.Runtime = meta, info.Runtime

        return c.store.Save(sb)
}

// ensureImageExists returns the image identified by ref. It first tries to
// resolve the reference locally; only when that lookup reports "not found"
// is the image pulled and then looked up again by the pulled image ID.
func (c *Controller) ensureImageExists(ctx context.Context, ref string, config *runtime.PodSandboxConfig, runtimeHandler string) (*imagestore.Image, error) {
        local, resolveErr := c.imageService.LocalResolve(ref)
        switch {
        case resolveErr == nil:
                // Already present, nothing to pull.
                return &local, nil
        case !errdefs.IsNotFound(resolveErr):
                return nil, fmt.Errorf("failed to get image %q: %w", ref, resolveErr)
        }

        // The image is missing locally, pull it to ensure it exists.
        // TODO: Cleaner interface
        pulledID, err := c.imageService.PullImage(ctx, ref, nil, config, runtimeHandler)
        if err != nil {
                return nil, fmt.Errorf("failed to pull image %q: %w", ref, err)
        }

        pulled, err := c.imageService.GetImage(pulledID)
        if err != nil {
                // Someone may have removed the image right after it was pulled.
                return nil, fmt.Errorf("failed to get image %q after pulling: %w", pulledID, err)
        }
        return &pulled, nil
}

// getSandboxImageName returns the name of the sandbox image used to scope pod
// shared resources used by the pod's containers. The image pinned as
// "sandbox" in the image service wins; when there is no image service or no
// such pinned image, the default sandbox image is returned.
func (c *Controller) getSandboxImageName() string {
        if c.imageService == nil {
                return criconfig.DefaultSandboxImage
        }
        if pinned := c.imageService.PinnedImage("sandbox"); pinned != "" {
                return pinned
        }
        return criconfig.DefaultSandboxImage
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "fmt"
        "os"
        "strconv"
        "strings"

        "github.com/containerd/containerd/v2/pkg/oci"
        imagespec "github.com/opencontainers/image-spec/specs-go/v1"
        runtimespec "github.com/opencontainers/runtime-spec/specs-go"
        "github.com/opencontainers/selinux/go-selinux"
        "golang.org/x/sys/unix"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/internal/cri/annotations"
        customopts "github.com/containerd/containerd/v2/internal/cri/opts"
        "github.com/containerd/containerd/v2/pkg/userns"
)

// sandboxContainerSpec builds the OCI runtime spec for the sandbox container
// identified by id.
//
// It accumulates spec options derived from the image config (env, working
// directory, process args), the pod sandbox config (hostname, cgroup parent,
// namespace options, user namespace, SELinux options, supplemental groups,
// sysctls, resource annotations, passthrough annotations) and the controller
// config, and then hands them to c.runtimeSpec.
//
// An error is returned when the image config has neither entrypoint nor cmd,
// when the user namespace configuration cannot be parsed or uses an
// unsupported mode, or when the SELinux labels cannot be initialised. If an
// error is returned after the SELinux labels were initialised, the process
// label is released again.
func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig,
        imageConfig *imagespec.ImageConfig, nsPath string, runtimePodAnnotations []string) (_ *runtimespec.Spec, retErr error) {
        // Creates a spec Generator with the default spec.
        // TODO(random-liu): [P1] Compare the default settings with docker and containerd default.
        specOpts := []oci.SpecOpts{
                oci.WithoutRunMount,
                customopts.WithoutDefaultSecuritySettings,
                customopts.WithRelativeRoot(relativeRootfsPath),
                oci.WithEnv(imageConfig.Env),
                oci.WithRootFSReadonly(),
                oci.WithHostname(config.GetHostname()),
        }
        if imageConfig.WorkingDir != "" {
                specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
        }

        if len(imageConfig.Entrypoint) == 0 && len(imageConfig.Cmd) == 0 {
                // Pause image must have entrypoint or cmd.
                return nil, fmt.Errorf("invalid empty entrypoint and cmd in image config %+v", imageConfig)
        }
        // NOTE(review): append may write into Entrypoint's backing array when it
        // has spare capacity — confirm no caller depends on it staying untouched.
        specOpts = append(specOpts, oci.WithProcessArgs(append(imageConfig.Entrypoint, imageConfig.Cmd...)...))

        // Set cgroups parent.
        if c.config.DisableCgroup {
                specOpts = append(specOpts, customopts.WithDisabledCgroups)
        } else {
                if config.GetLinux().GetCgroupParent() != "" {
                        cgroupsPath := getCgroupsPath(config.GetLinux().GetCgroupParent(), id)
                        specOpts = append(specOpts, oci.WithCgroup(cgroupsPath))
                }
        }

        // When cgroup parent is not set, containerd-shim will create container in a child cgroup
        // of the cgroup itself is in.
        // TODO(random-liu): [P2] Set default cgroup path if cgroup parent is not specified.

        // Set namespace options.
        var (
                securityContext = config.GetLinux().GetSecurityContext()
                nsOptions       = securityContext.GetNamespaceOptions()
        )
        // Host network mode drops both the network and the UTS namespace;
        // otherwise the sandbox joins the network namespace at nsPath.
        if nsOptions.GetNetwork() == runtime.NamespaceMode_NODE {
                specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.NetworkNamespace))
                specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.UTSNamespace))
        } else {
                specOpts = append(specOpts, oci.WithLinuxNamespace(
                        runtimespec.LinuxNamespace{
                                Type: runtimespec.NetworkNamespace,
                                Path: nsPath,
                        }))
        }
        if nsOptions.GetPid() == runtime.NamespaceMode_NODE {
                specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.PIDNamespace))
        }
        if nsOptions.GetIpc() == runtime.NamespaceMode_NODE {
                specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.IPCNamespace))
        }

        usernsOpts := nsOptions.GetUsernsOptions()
        uids, gids, err := parseUsernsIDs(usernsOpts)
        // usernsEnabled is set only for pod-mode user namespaces; it is consulted
        // below when deciding whether to set the ping group range sysctl.
        var usernsEnabled bool
        if err != nil {
                return nil, fmt.Errorf("user namespace configuration: %w", err)
        }

        if usernsOpts != nil {
                switch mode := usernsOpts.GetMode(); mode {
                case runtime.NamespaceMode_NODE:
                        specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.UserNamespace))
                case runtime.NamespaceMode_POD:
                        specOpts = append(specOpts, oci.WithUserNamespace(uids, gids))
                        usernsEnabled = true
                default:
                        return nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
                }
        }

        // It's fine to generate the spec before the sandbox /dev/shm
        // is actually created.
        sandboxDevShm := c.getSandboxDevShm(id)
        if nsOptions.GetIpc() == runtime.NamespaceMode_NODE {
                // Host IPC: bind the host's /dev/shm instead of a per-sandbox one.
                sandboxDevShm = devShm
        }
        // Remove the default /dev/shm mount from defaultMounts, it is added in oci/mounts.go.
        specOpts = append(specOpts, oci.WithoutMounts(devShm))
        // When user-namespace is enabled, the `nosuid, nodev, noexec` flags are
        // required, otherwise the remount will fail with EPERM. Just use them
        // unconditionally, they are nice to have anyways.
        specOpts = append(specOpts, oci.WithMounts([]runtimespec.Mount{
                {
                        Source:      sandboxDevShm,
                        Destination: devShm,
                        Type:        "bind",
                        Options:     []string{"rbind", "ro", "nosuid", "nodev", "noexec"},
                },
                // Add resolv.conf for katacontainers to setup the DNS of pod VM properly.
                {
                        Source:      c.getResolvPath(id),
                        Destination: resolvConfPath,
                        Type:        "bind",
                        Options:     []string{"rbind", "ro", "nosuid", "nodev", "noexec"},
                },
        }))

        processLabel, mountLabel, err := initLabelsFromOpt(securityContext.GetSelinuxOptions())
        if err != nil {
                return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err)
        }
        // Release the process label again if spec generation fails from here on.
        defer func() {
                if retErr != nil {
                        selinux.ReleaseLabel(processLabel)
                }
        }()

        supplementalGroups := securityContext.GetSupplementalGroups()
        specOpts = append(specOpts,
                customopts.WithSelinuxLabels(processLabel, mountLabel),
                customopts.WithSupplementalGroups(supplementalGroups),
        )

        // Add sysctls
        // NOTE(review): when the config already carries a sysctl map, the
        // defaults below are written into that same map — confirm this mutation
        // of the caller's config is intended.
        sysctls := config.GetLinux().GetSysctls()
        if sysctls == nil {
                sysctls = make(map[string]string)
        }
        _, ipUnprivilegedPortStart := sysctls["net.ipv4.ip_unprivileged_port_start"]
        _, pingGroupRange := sysctls["net.ipv4.ping_group_range"]
        // Defaults are only applied outside host network mode and never
        // override a value the pod config set explicitly.
        if nsOptions.GetNetwork() != runtime.NamespaceMode_NODE {
                if c.config.EnableUnprivilegedPorts && !ipUnprivilegedPortStart {
                        sysctls["net.ipv4.ip_unprivileged_port_start"] = "0"
                }
                if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() && !usernsEnabled {
                        sysctls["net.ipv4.ping_group_range"] = "0 2147483647"
                }
        }
        specOpts = append(specOpts, customopts.WithSysctls(sysctls))

        // Note: LinuxSandboxSecurityContext does not currently provide an apparmor profile

        if !c.config.DisableCgroup {
                specOpts = append(specOpts, customopts.WithDefaultSandboxShares)
        }

        // Expose the pod-level resource settings as sandbox annotations.
        if res := config.GetLinux().GetResources(); res != nil {
                specOpts = append(specOpts,
                        customopts.WithAnnotation(annotations.SandboxCPUPeriod, strconv.FormatInt(res.CpuPeriod, 10)),
                        customopts.WithAnnotation(annotations.SandboxCPUQuota, strconv.FormatInt(res.CpuQuota, 10)),
                        customopts.WithAnnotation(annotations.SandboxCPUShares, strconv.FormatInt(res.CpuShares, 10)),
                        customopts.WithAnnotation(annotations.SandboxMem, strconv.FormatInt(res.MemoryLimitInBytes, 10)))
        }

        specOpts = append(specOpts, customopts.WithPodOOMScoreAdj(int(defaultSandboxOOMAdj), c.config.RestrictOOMScoreAdj))

        for pKey, pValue := range getPassthroughAnnotations(config.Annotations,
                runtimePodAnnotations) {
                specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
        }

        specOpts = append(specOpts, annotations.DefaultCRIAnnotations(id, "", c.getSandboxImageName(), config, true)...)

        return c.runtimeSpec(id, "", specOpts...)
}

// sandboxContainerSpecOpts generates the additional OCI spec options for the
// sandbox container: an optional seccomp option and an optional user option.
// The user comes from the CRI security context when one is given there and
// falls back to the user recorded in the image config.
func (c *Controller) sandboxContainerSpecOpts(config *runtime.PodSandboxConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
        secCtx := config.GetLinux().GetSecurityContext()

        // Resolve the seccomp profile, deriving one when none is set explicitly.
        profile := secCtx.GetSeccomp()
        if profile == nil {
                generated, err := generateSeccompSecurityProfile(
                        secCtx.GetSeccompProfilePath(), //nolint:staticcheck // Deprecated but we don't want to remove yet
                        c.config.UnsetSeccompProfile)
                if err != nil {
                        return nil, fmt.Errorf("failed to generate seccomp spec opts: %w", err)
                }
                profile = generated
        }

        var result []oci.SpecOpts

        seccompOpt, err := c.generateSeccompSpecOpts(profile, secCtx.GetPrivileged(), c.seccompEnabled())
        if err != nil {
                return nil, fmt.Errorf("failed to generate seccomp spec opts: %w", err)
        }
        if seccompOpt != nil {
                result = append(result, seccompOpt)
        }

        userSpec, err := generateUserString("", secCtx.GetRunAsUser(), secCtx.GetRunAsGroup())
        if err != nil {
                return nil, fmt.Errorf("failed to generate user string: %w", err)
        }
        if userSpec == "" {
                // No user override was passed via CRI, so fall back to the user
                // from the OCI image.
                userSpec = imageConfig.User
        }
        if userSpec != "" {
                result = append(result, oci.WithUser(userSpec))
        }

        return result, nil
}

// setupSandboxFiles creates the files a sandbox needs: its hostname file, a
// copy of the hosts file, a resolv.conf (generated from the pod DNS config
// or copied from the host) and, unless the pod uses host IPC, a dedicated
// tmpfs mounted as the sandbox /dev/shm.
func (c *Controller) setupSandboxFiles(id string, config *runtime.PodSandboxConfig) error {
        // Hostname: prefer the one from the pod config, otherwise ask the OS.
        hostnamePath := c.getSandboxHostname(id)
        name := config.GetHostname()
        if name == "" {
                osName, err := c.os.Hostname()
                if err != nil {
                        return fmt.Errorf("failed to get hostname: %w", err)
                }
                name = osName
        }
        if err := c.os.WriteFile(hostnamePath, []byte(name+"\n"), 0644); err != nil {
                return fmt.Errorf("failed to write hostname to %q: %w", hostnamePath, err)
        }

        // TODO(random-liu): Consider whether we should maintain /etc/hosts and /etc/resolv.conf in kubelet.
        hostsPath := c.getSandboxHosts(id)
        if err := c.os.CopyFile(etcHosts, hostsPath, 0644); err != nil {
                return fmt.Errorf("failed to generate sandbox hosts file %q: %w", hostsPath, err)
        }

        // DNS: maintain a resolv.conf for the sandbox.
        resolvPath := c.getResolvPath(id)
        dns := config.GetDnsConfig()
        if dns == nil {
                // A nil DnsConfig is interpreted as "use the global default",
                // which is dubious but backwards-compatible.
                if err := c.os.CopyFile(resolvConfPath, resolvPath, 0644); err != nil {
                        return fmt.Errorf("failed to copy host's resolv.conf to %q: %w", resolvPath, err)
                }
        } else {
                content, err := parseDNSOptions(dns.Servers, dns.Searches, dns.Options)
                if err != nil {
                        return fmt.Errorf("failed to parse sandbox DNSConfig %+v: %w", dns, err)
                }
                if err := c.os.WriteFile(resolvPath, []byte(content), 0644); err != nil {
                        return fmt.Errorf("failed to write resolv content to %q: %w", resolvPath, err)
                }
        }

        // /dev/shm: host IPC only needs the host's /dev/shm to exist.
        ipcMode := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetIpc()
        if ipcMode == runtime.NamespaceMode_NODE {
                if _, err := c.os.Stat(devShm); err != nil {
                        return fmt.Errorf("host %q is not available for host ipc: %w", devShm, err)
                }
                return nil
        }

        // Otherwise create and mount a tmpfs dedicated to this sandbox.
        shmDir := c.getSandboxDevShm(id)
        if err := c.os.MkdirAll(shmDir, 0700); err != nil {
                return fmt.Errorf("failed to create sandbox shm: %w", err)
        }
        mountData := fmt.Sprintf("mode=1777,size=%d", defaultShmSize)
        mountFlags := uintptr(unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV)
        if err := c.os.Mount("shm", shmDir, "tmpfs", mountFlags, mountData); err != nil {
                return fmt.Errorf("failed to mount sandbox shm: %w", err)
        }
        return nil
}

// parseDNSOptions renders the given DNS settings as resolv.conf content:
// one "search" line listing all search domains, one "nameserver" line per
// server, and one "options" line listing all options. Sections whose input
// is empty are omitted, so the result is empty when nothing is specified.
// The returned error is always nil.
func parseDNSOptions(servers, searches, options []string) (string, error) {
        var content strings.Builder

        if len(searches) > 0 {
                fmt.Fprintf(&content, "search %s\n", strings.Join(searches, " "))
        }

        for _, server := range servers {
                fmt.Fprintf(&content, "nameserver %s\n", server)
        }

        if len(options) > 0 {
                fmt.Fprintf(&content, "options %s\n", strings.Join(options, " "))
        }

        return content.String(), nil
}

// cleanupSandboxFiles unmounts the sandbox /dev/shm. The files themselves
// are not removed here; that is left to the removal of the sandbox root
// directory. Nothing is done for pods using host IPC, and an unmount error
// that satisfies os.IsNotExist is not reported as a failure.
func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error {
        ipcMode := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetIpc()
        if ipcMode == runtime.NamespaceMode_NODE {
                return nil
        }

        shmPath, err := c.os.FollowSymlinkInScope(c.getSandboxDevShm(id), "/")
        if err != nil {
                return fmt.Errorf("failed to follow symlink: %w", err)
        }

        err = c.os.Unmount(shmPath)
        if err == nil || os.IsNotExist(err) {
                return nil
        }
        return fmt.Errorf("failed to unmount %q: %w", shmPath, err)
}

// sandboxSnapshotterOpts returns the platform specific snapshotter options
// for a sandbox container by passing the pod's namespace options to
// snapshotterRemapOpts.
func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) {
        return snapshotterRemapOpts(config.GetLinux().GetSecurityContext().GetNamespaceOptions())
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "context"

        "github.com/containerd/containerd/api/types"
        "github.com/containerd/errdefs"
)

// Metrics is not implemented for the pod sandbox controller yet: it ignores
// its arguments and always returns a nil metric together with
// errdefs.ErrNotImplemented.
// TODO(dcantah): Implement metrics to be used for SandboxStats rpc.
func (c *Controller) Metrics(ctx context.Context, sandboxID string) (*types.Metric, error) {
        return nil, errdefs.ErrNotImplemented
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "context"
        "encoding/json"
        "fmt"

        "github.com/containerd/typeurl/v2"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/internal/cri/server/podsandbox/types"
        critypes "github.com/containerd/containerd/v2/internal/cri/types"
        "github.com/containerd/errdefs"
)

// Status reports the current state of the sandbox identified by sandboxID:
// its pid, state string and created/exited timestamps as recorded in the
// controller store. When verbose is set, the CRI sandbox info map is
// attached as well. An error wrapping errdefs.ErrNotFound is returned when
// the sandbox is not in the store.
func (c *Controller) Status(ctx context.Context, sandboxID string, verbose bool) (sandbox.ControllerStatus, error) {
        podSandbox := c.store.Get(sandboxID)
        if podSandbox == nil {
                return sandbox.ControllerStatus{}, fmt.Errorf("unable to find sandbox %q: %w", sandboxID, errdefs.ErrNotFound)
        }

        current := podSandbox.Status.Get()
        result := sandbox.ControllerStatus{
                SandboxID: sandboxID,
                Pid:       current.Pid,
                State:     current.State.String(),
                CreatedAt: current.CreatedAt,
                ExitedAt:  current.ExitedAt,
        }
        if !verbose {
                return result, nil
        }

        // Verbose output additionally carries the CRI sandbox info map.
        info, err := toCRISandboxInfo(ctx, podSandbox)
        if err != nil {
                return sandbox.ControllerStatus{}, err
        }
        result.Info = info
        return result, nil
}

// toCRISandboxInfo builds the verbose info map for a pod sandbox. The returned
// map has a single "info" key whose value is the JSON encoding of a
// critypes.SandboxInfo assembled from the sandbox metadata and, when the
// sandbox has a container, from that container's task, spec and info.
func toCRISandboxInfo(ctx context.Context, sb *types.PodSandbox) (map[string]string, error) {
        info := &critypes.SandboxInfo{
                Pid:            sb.Status.Get().Pid,
                Config:         sb.Metadata.Config,
                RuntimeHandler: sb.Metadata.RuntimeHandler,
                CNIResult:      sb.Metadata.CNIResult,
                Metadata:       &sb.Metadata,
        }

        if ctr := sb.Container; ctr != nil {
                // A missing task is tolerated; any other lookup failure is fatal.
                task, err := ctr.Task(ctx, nil)
                if err != nil && !errdefs.IsNotFound(err) {
                        return nil, fmt.Errorf("failed to get sandbox container task: %w", err)
                }

                var procStatus containerd.ProcessStatus
                if task != nil {
                        taskStatus, err := task.Status(ctx)
                        switch {
                        case err == nil:
                                procStatus = taskStatus.Status
                        case errdefs.IsNotFound(err):
                                procStatus = containerd.Unknown
                        default:
                                return nil, fmt.Errorf("failed to get task status: %w", err)
                        }
                }
                info.Status = string(procStatus)

                runtimeSpec, err := ctr.Spec(ctx)
                if err != nil {
                        return nil, fmt.Errorf("failed to get sandbox container runtime spec: %w", err)
                }
                info.RuntimeSpec = runtimeSpec

                record, err := ctr.Info(ctx)
                if err != nil {
                        return nil, fmt.Errorf("failed to get sandbox container info: %w", err)
                }
                // The image is taken from the container record rather than
                // config.SandboxImage: the configuration may have changed across
                // a restart and would then no longer match the image actually
                // used by the sandbox container.
                info.Image = record.Image
                info.SnapshotKey = record.SnapshotKey
                info.Snapshotter = record.Snapshotter

                opts, err := getRuntimeOptions(record)
                if err != nil {
                        return nil, fmt.Errorf("failed to get runtime options: %w", err)
                }

                info.RuntimeType = record.Runtime.Name
                info.RuntimeOptions = opts
        }

        // An empty status means no task status was obtained (the task is
        // deleted). Report "deleted", a status that does not exist in containerd.
        if info.Status == "" {
                info.Status = "deleted"
        }

        if ns := getNetNS(&sb.Metadata); ns != nil {
                // Only sandboxes that are not using the host network have a
                // network namespace to report on.
                closed, err := ns.Closed()
                if err != nil {
                        return nil, fmt.Errorf("failed to check network namespace closed: %w", err)
                }
                info.NetNSClosed = closed
        }

        encoded, err := json.Marshal(info)
        if err != nil {
                return nil, fmt.Errorf("failed to marshal info %v: %w", info, err)
        }

        return map[string]string{"info": string(encoded)}, nil
}

// getRuntimeOptions decodes the runtime options stored in the container
// metadata. It returns (nil, nil) when no options are set or their value is
// empty, and the error from typeurl.UnmarshalAny when decoding fails.
func getRuntimeOptions(c containers.Container) (interface{}, error) {
        raw := c.Runtime.Options
        if raw != nil && raw.GetValue() != nil {
                decoded, err := typeurl.UnmarshalAny(raw)
                if err != nil {
                        return nil, err
                }
                return decoded, nil
        }
        return nil, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "context"
        "fmt"
        "syscall"
        "time"

        "github.com/containerd/errdefs"
        "github.com/containerd/log"

        eventtypes "github.com/containerd/containerd/api/events"
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/internal/cri/server/podsandbox/types"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/containerd/v2/pkg/protobuf"
)

// Stop stops the sandbox identified by sandboxID and cleans up its files.
// It returns errdefs.ErrNotFound when the sandbox is not in the store and is a
// no-op when the sandbox has no container. The sandbox container is only
// stopped when the sandbox is in the ready or unknown state; file cleanup runs
// regardless of state. Stop options are ignored.
func (c *Controller) Stop(ctx context.Context, sandboxID string, _ ...sandbox.StopOpt) error {
        sb := c.store.Get(sandboxID)
        switch {
        case sb == nil:
                return errdefs.ErrNotFound
        case sb.Container == nil:
                return nil
        }

        meta, err := getMetadata(ctx, sb.Container)
        if err != nil {
                return err
        }

        switch state := sb.Status.Get().State; state {
        case sandboxstore.StateReady, sandboxstore.StateUnknown:
                if err := c.stopSandboxContainer(ctx, sb); err != nil {
                        return fmt.Errorf("failed to stop sandbox container %q in %q state: %w", sandboxID, state, err)
                }
        }

        if err := c.cleanupSandboxFiles(sandboxID, meta.Config); err != nil {
                return fmt.Errorf("failed to cleanup sandbox files: %w", err)
        }
        return nil
}

// stopSandboxContainer kills the sandbox container with SIGKILL and then waits
// (via podSandbox.Wait) until the sandbox is marked as exited or ctx is done.
// `task.Delete` is not called here because it will be called when
// the event monitor handles the `TaskExit` event.
//
// For a sandbox in the unknown state, a temporary exit handler is started for
// the duration of this call; if the task no longer exists, the sandbox is
// cleaned up through cleanupUnknownSandbox instead.
func (c *Controller) stopSandboxContainer(ctx context.Context, podSandbox *types.PodSandbox) error {
        id := podSandbox.ID
        container := podSandbox.Container
        state := podSandbox.Status.Get().State
        task, err := container.Task(ctx, nil)
        if err != nil {
                // Any error other than "not found" is a real failure.
                if !errdefs.IsNotFound(err) {
                        return fmt.Errorf("failed to get pod sandbox container: %w", err)
                }
                // Don't return for unknown state, some cleanup needs to be done.
                if state == sandboxstore.StateUnknown {
                        return cleanupUnknownSandbox(ctx, id, podSandbox)
                }
                // The task is already gone and the state is known: nothing to stop.
                return nil
        }

        // Handle unknown state.
        // The cleanup logic is the same with container unknown state.
        if state == sandboxstore.StateUnknown {
                // Start an exit handler for sandbox container in unknown state.
                // The wait uses a fresh namespaced context rather than ctx; it is
                // cancelled when this function returns.
                waitCtx, waitCancel := context.WithCancel(ctrdutil.NamespacedContext())
                defer waitCancel()
                exitCh, err := task.Wait(waitCtx)
                if err != nil {
                        if !errdefs.IsNotFound(err) {
                                return fmt.Errorf("failed to wait for task: %w", err)
                        }
                        // The task disappeared before it could be waited on.
                        return cleanupUnknownSandbox(ctx, id, podSandbox)
                }

                exitCtx, exitCancel := context.WithCancel(context.Background())
                // stopCh is closed once the exit-handler goroutine has returned.
                stopCh := make(chan struct{})
                go func() {
                        defer close(stopCh)
                        err := c.waitSandboxExit(exitCtx, podSandbox, exitCh)
                        // Cancellation and deadline errors are not logged.
                        if err != nil && err != context.Canceled && err != context.DeadlineExceeded {
                                log.G(ctx).WithError(err).Errorf("Failed to wait sandbox exit %+v", err)
                        }
                }()
                // Deferred calls run in LIFO order, so this runs before the
                // deferred waitCancel above.
                defer func() {
                        exitCancel()
                        // This ensures that exit monitor is stopped before
                        // `Wait` is cancelled, so no exit event is generated
                        // because of the `Wait` cancellation.
                        <-stopCh
                }()
        }

        // Kill the pod sandbox container.
        // A "not found" error is ignored here.
        if err = task.Kill(ctx, syscall.SIGKILL); err != nil && !errdefs.IsNotFound(err) {
                return fmt.Errorf("failed to kill pod sandbox container: %w", err)
        }

        // Block until the sandbox exit has been recorded or ctx is done.
        _, err = podSandbox.Wait(ctx)
        return err
}

// cleanupUnknownSandbox cleans up a stopped sandbox that is in the unknown
// state. It reuses handleSandboxTaskExit, feeding it a synthetic TaskExit event
// carrying unknownExitCode and the current time as the exit timestamp.
// The id parameter is currently unused.
func cleanupUnknownSandbox(ctx context.Context, id string, sandbox *types.PodSandbox) error {
        exitEvent := &eventtypes.TaskExit{
                ExitStatus: unknownExitCode,
                ExitedAt:   protobuf.ToTimestamp(time.Now()),
        }
        return handleSandboxTaskExit(ctx, sandbox, exitEvent)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package podsandbox

import (
        "fmt"
        "sync"

        "github.com/containerd/containerd/v2/internal/cri/server/podsandbox/types"
)

// Store is an in-memory collection of pod sandboxes backed by a sync.Map.
type Store struct {
        // m maps a sandbox ID (string) to its *types.PodSandbox.
        m sync.Map
}

// NewStore returns an empty, ready-to-use Store.
func NewStore() *Store {
        return new(Store)
}

// Save stores p under its ID, replacing any sandbox previously saved with the
// same ID. It returns an error when p is nil.
func (s *Store) Save(p *types.PodSandbox) error {
        if p != nil {
                s.m.Store(p.ID, p)
                return nil
        }
        return fmt.Errorf("pod sandbox should not be nil")
}

// Get returns the sandbox stored under id, or nil when there is none.
func (s *Store) Get(id string) *types.PodSandbox {
        if entry, found := s.m.Load(id); found {
                return entry.(*types.PodSandbox)
        }
        return nil
}

// Remove deletes the sandbox stored under id and returns it, or returns nil
// when no sandbox was stored under that id.
func (s *Store) Remove(id string) *types.PodSandbox {
        if entry, found := s.m.LoadAndDelete(id); found {
                return entry.(*types.PodSandbox)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package types

import (
        "context"
        "time"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/internal/cri/store"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
)

// PodSandbox holds the state tracked for a single pod sandbox.
type PodSandbox struct {
        // ID is the sandbox identifier.
        ID string
        // Container is the containerd container backing the sandbox.
        // NewPodSandbox leaves it nil; it is set elsewhere.
        Container containerd.Container
        // Metadata is the sandbox metadata.
        Metadata sandboxstore.Metadata
        // Runtime holds the sandbox runtime options.
        Runtime sandbox.RuntimeOpts
        // Status is the storage for the sandbox status.
        Status sandboxstore.StatusStorage
        // stopChan is stopped once the sandbox has exited (see Exit); Wait
        // blocks on it.
        stopChan *store.StopCh
}

// NewPodSandbox creates a PodSandbox with the given id and initial status and
// no container. When the initial state is StateNotReady the stop channel is
// stopped immediately, so Wait on the new sandbox returns without blocking.
func NewPodSandbox(id string, status sandboxstore.Status) *PodSandbox {
        stopCh := store.NewStopCh()
        if status.State == sandboxstore.StateNotReady {
                stopCh.Stop()
        }
        return &PodSandbox{
                ID:       id,
                Status:   sandboxstore.StoreStatus(status),
                stopChan: stopCh,
        }
}

// Exit records that the sandbox has exited with the given code at exitTime:
// the status is moved to StateNotReady, the exit status and time are stored and
// the pid is reset to 0. After a successful status update the stop channel is
// stopped, which unblocks Wait. If the update fails, its error is returned and
// the stop channel is left untouched.
func (p *PodSandbox) Exit(code uint32, exitTime time.Time) error {
        markExited := func(st sandboxstore.Status) (sandboxstore.Status, error) {
                st.State = sandboxstore.StateNotReady
                st.ExitStatus = code
                st.ExitedAt = exitTime
                st.Pid = 0
                return st, nil
        }

        err := p.Status.Update(markExited)
        if err != nil {
                return err
        }

        p.stopChan.Stop()
        return nil
}

// Wait blocks until the sandbox's stop channel is stopped or ctx is done.
// On stop it returns an exit status built from the stored exit code and exit
// time; when ctx finishes first it returns a zero ExitStatus and ctx.Err().
func (p *PodSandbox) Wait(ctx context.Context) (containerd.ExitStatus, error) {
        select {
        case <-p.stopChan.Stopped():
                st := p.Status.Get()
                exit := containerd.NewExitStatus(st.ExitStatus, st.ExitedAt, nil)
                return *exit, nil
        case <-ctx.Done():
                return containerd.ExitStatus{}, ctx.Err()
        }
}

//go:build !no_rdt

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "fmt"

        "github.com/containerd/containerd/v2/pkg/rdt"
        "github.com/containerd/log"
)

// rdtClassFromAnnotations examines the container and pod annotations of a
// container and returns its effective RDT class.
//
// A non-empty class while RDT is disabled is treated as an error. Any error
// raised while RDT is disabled is logged at debug level and swallowed,
// returning an empty class, when the IgnoreRdtNotEnabledErrors config option is
// set; otherwise the error is returned.
func (c *criService) rdtClassFromAnnotations(containerName string, containerAnnotations, podAnnotations map[string]string) (string, error) {
        cls, err := rdt.ContainerClassFromAnnotations(containerName, containerAnnotations, podAnnotations)

        // Our internal check that RDT has been enabled
        if err == nil && cls != "" && !rdt.IsEnabled() {
                err = fmt.Errorf("RDT disabled, refusing to set RDT class of container %q to %q", containerName, cls)
        }

        if err == nil {
                return cls, nil
        }

        if !rdt.IsEnabled() && c.config.ContainerdConfig.IgnoreRdtNotEnabledErrors {
                log.L.Debugf("continuing create container %s, ignoring rdt not enabled (%v)", containerName, err)
                return "", nil
        }
        return "", err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "os"
        "path/filepath"
        "time"

        containerd "github.com/containerd/containerd/v2/client"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        crilabels "github.com/containerd/containerd/v2/internal/cri/labels"
        "github.com/containerd/containerd/v2/internal/cri/server/podsandbox"
        containerdio "github.com/containerd/containerd/v2/pkg/cio"
        "github.com/containerd/containerd/v2/pkg/netns"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
        "golang.org/x/sync/errgroup"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        cio "github.com/containerd/containerd/v2/internal/cri/io"
        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
)

// NOTE: The recovery logic makes the following assumptions about the period when the cri plugin is down:
// 1) Files (e.g. root directory, netns) and checkpoints maintained by the plugin MUST NOT be
// touched. Otherwise, the recovery logic for those containers/sandboxes may return an error.
// 2) Containerd containers may be deleted, but SHOULD NOT be added. Otherwise, the recovery logic
// for the newly added container/sandbox will return an error, because no corresponding root
// directory was created.
// 3) Containerd container tasks may exit or be stopped or deleted. Even though the current logic can
// tolerate tasks being created or started, we prefer that this does not happen.

// recover recovers system state from containerd and status checkpoint.
//
// It proceeds in the following order:
//  1. Loads every containerd container labelled as a sandbox through the pod
//     sandbox controller and adds it to the sandbox store and name index.
//  2. Adds sandboxes that exist only in the containerd sandbox store, deriving
//     their state from the owning sandbox controller.
//  3. Starts an exit monitor for every sandbox that is not in the NotReady state.
//  4. Loads every containerd container labelled as a container.
//  5. Checks images through the image service.
//  6. Removes orphaned sandbox/container directories under the root and state
//     directories.
//
// A sandbox or container that fails to load is logged and skipped; failures to
// add a loaded object to a store or to reserve its name abort recovery.
func (c *criService) recover(ctx context.Context) error {
        // Recover all sandboxes.
        sandboxes, err := c.client.Containers(ctx, filterLabel(crilabels.ContainerKindLabel, crilabels.ContainerKindSandbox))
        if err != nil {
                return fmt.Errorf("failed to list sandbox containers: %w", err)
        }

        podSandboxController, err := c.sandboxService.SandboxController(string(criconfig.ModePodSandbox))
        if err != nil {
                // Wrap with %w so callers can still inspect the underlying error.
                return fmt.Errorf("failed to get podsandbox controller: %w", err)
        }
        podSandboxLoader, ok := podSandboxController.(podSandboxRecover)
        if !ok {
                log.G(ctx).Fatal("pod sandbox controller doesn't support recovery")
        }

        eg, ctx2 := errgroup.WithContext(ctx)
        for _, sandbox := range sandboxes {
                sandbox := sandbox
                eg.Go(func() error {
                        sb, err := podSandboxLoader.RecoverContainer(ctx2, sandbox)
                        if err != nil {
                                // A sandbox that cannot be loaded is logged and skipped so
                                // that it does not prevent the others from being recovered.
                                log.G(ctx2).
                                        WithError(err).
                                        WithField("sandbox", sandbox.ID()).
                                        Error("Failed to load sandbox")

                                return nil
                        }
                        log.G(ctx2).Debugf("Loaded sandbox %+v", sb)
                        if err := c.sandboxStore.Add(sb); err != nil {
                                return fmt.Errorf("failed to add sandbox %q to store: %w", sandbox.ID(), err)
                        }
                        if err := c.sandboxNameIndex.Reserve(sb.Name, sb.ID); err != nil {
                                return fmt.Errorf("failed to reserve sandbox name %q: %w", sb.Name, err)
                        }
                        return nil
                })
        }
        if err := eg.Wait(); err != nil {
                return err
        }

        // Recover sandboxes in the new SandboxStore
        storedSandboxes, err := c.client.SandboxStore().List(ctx)
        if err != nil {
                return fmt.Errorf("failed to list sandboxes from API: %w", err)
        }
        for _, sbx := range storedSandboxes {
                // Skip sandboxes that were already recovered above.
                if _, err := c.sandboxStore.Get(sbx.ID); err == nil {
                        continue
                }

                metadata := sandboxstore.Metadata{}
                err := sbx.GetExtension(podsandbox.MetadataKey, &metadata)
                if err != nil {
                        return fmt.Errorf("failed to get metadata for stored sandbox %q: %w", sbx.ID, err)
                }

                var (
                        state      = sandboxstore.StateUnknown
                        controller = c.client.SandboxController(sbx.Sandboxer)
                        endpoint   sandboxstore.Endpoint
                )

                status, err := controller.Status(ctx, sbx.ID, false)
                if err != nil {
                        log.G(ctx).
                                WithError(err).
                                WithField("sandbox", sbx.ID).
                                Error("failed to recover sandbox state")

                        // A sandbox unknown to its controller is treated as not ready;
                        // any other failure leaves the state as unknown.
                        if errdefs.IsNotFound(err) {
                                state = sandboxstore.StateNotReady
                        }
                } else {
                        endpoint.Version = status.Version
                        endpoint.Address = status.Address
                        // Map the reported state name onto the store state; names
                        // that are not recognised leave the state as unknown.
                        if code, ok := runtime.PodSandboxState_value[status.State]; ok {
                                if code == int32(runtime.PodSandboxState_SANDBOX_READY) {
                                        state = sandboxstore.StateReady
                                } else if code == int32(runtime.PodSandboxState_SANDBOX_NOTREADY) {
                                        state = sandboxstore.StateNotReady
                                }
                        }
                }

                sb := sandboxstore.NewSandbox(metadata, sandboxstore.Status{State: state})
                sb.Sandboxer = sbx.Sandboxer
                sb.Endpoint = endpoint

                // Load network namespace.
                sb.NetNS = getNetNS(&metadata)

                if err := c.sandboxStore.Add(sb); err != nil {
                        return fmt.Errorf("failed to add stored sandbox %q to store: %w", sbx.ID, err)
                }
        }

        // Start an exit monitor for every sandbox that is not already NotReady.
        for _, sb := range c.sandboxStore.List() {
                sb := sb
                status := sb.Status.Get()
                if status.State == sandboxstore.StateNotReady {
                        continue
                }
                exitCh, err := c.sandboxService.WaitSandbox(ctrdutil.NamespacedContext(), sb.Sandboxer, sb.ID)
                if err != nil {
                        log.G(ctx).WithError(err).Error("failed to wait sandbox")
                        continue
                }
                c.startSandboxExitMonitor(context.Background(), sb.ID, exitCh)
        }
        // Recover all containers.
        containers, err := c.client.Containers(ctx, filterLabel(crilabels.ContainerKindLabel, crilabels.ContainerKindContainer))
        if err != nil {
                return fmt.Errorf("failed to list containers: %w", err)
        }
        eg, ctx2 = errgroup.WithContext(ctx)
        for _, container := range containers {
                container := container
                eg.Go(func() error {
                        cntr, err := c.loadContainer(ctx2, container)
                        if err != nil {
                                // As with sandboxes, a container that cannot be loaded is
                                // logged and skipped.
                                log.G(ctx2).
                                        WithError(err).
                                        WithField("container", container.ID()).
                                        Error("Failed to load container")

                                return nil
                        }
                        log.G(ctx2).Debugf("Loaded container %+v", cntr)
                        if err := c.containerStore.Add(cntr); err != nil {
                                return fmt.Errorf("failed to add container %q to store: %w", container.ID(), err)
                        }
                        if err := c.containerNameIndex.Reserve(cntr.Name, cntr.ID); err != nil {
                                return fmt.Errorf("failed to reserve container name %q: %w", cntr.Name, err)
                        }
                        return nil
                })
        }
        if err := eg.Wait(); err != nil {
                return err
        }

        // Recover all images.
        if err := c.ImageService.CheckImages(ctx); err != nil {
                return fmt.Errorf("failed to check images: %w", err)
        }

        // It's possible that containerd containers are deleted unexpectedly. In that case,
        // we can't even get metadata, we should cleanup orphaned sandbox/container directories
        // with best effort.

        // Cleanup orphaned sandbox and container directories without corresponding containerd container.
        for _, cleanup := range []struct {
                cntrs  []containerd.Container
                base   string
                errMsg string
        }{
                {
                        cntrs:  sandboxes,
                        base:   filepath.Join(c.config.RootDir, sandboxesDir),
                        errMsg: "failed to cleanup orphaned sandbox directories",
                },
                {
                        cntrs:  sandboxes,
                        base:   filepath.Join(c.config.StateDir, sandboxesDir),
                        errMsg: "failed to cleanup orphaned volatile sandbox directories",
                },
                {
                        cntrs:  containers,
                        base:   filepath.Join(c.config.RootDir, containersDir),
                        errMsg: "failed to cleanup orphaned container directories",
                },
                {
                        cntrs:  containers,
                        base:   filepath.Join(c.config.StateDir, containersDir),
                        errMsg: "failed to cleanup orphaned volatile container directories",
                },
        } {
                if err := cleanupOrphanedIDDirs(ctx, cleanup.cntrs, cleanup.base); err != nil {
                        return fmt.Errorf("%s: %w", cleanup.errMsg, err)
                }
        }
        return nil
}

// loadContainerTimeout is the default timeout for loading a container/sandbox.
// A single hanging container/sandbox (e.g. containerd#2438) should not affect
// other containers/sandboxes.
// Most CRI container/sandbox related operations are per container; the ones
// which handle multiple containers at a time are:
// * ListPodSandboxes: Doesn't talk to containerd services.
// * ListContainers: Doesn't talk to containerd services.
// * ListContainerStats: Not in the critical code path; a default timeout will
// be applied at the CRI level.
// * Recovery logic: We should set a timeout for each container/sandbox recovery.
// * Event monitor: We should set a timeout for each container/sandbox event handling.
const loadContainerTimeout = 10 * time.Second

// loadContainer loads container from containerd and status checkpoint.
func (c *criService) loadContainer(ctx context.Context, cntr containerd.Container) (containerstore.Container, error) {
        ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
        defer cancel()
        id := cntr.ID()
        containerDir := c.getContainerRootDir(id)
        var container containerstore.Container
        // Load container metadata.
        exts, err := cntr.Extensions(ctx)
        if err != nil {
                return container, fmt.Errorf("failed to get container extensions: %w", err)
        }
        ext, ok := exts[crilabels.ContainerMetadataExtension]
        if !ok {
                return container, fmt.Errorf("metadata extension %q not found", crilabels.ContainerMetadataExtension)
        }
        data, err := typeurl.UnmarshalAny(ext)
        if err != nil {
                return container, fmt.Errorf("failed to unmarshal metadata extension %q: %w", ext, err)
        }
        meta := data.(*containerstore.Metadata)

        // Load status from checkpoint.
        status, err := containerstore.LoadStatus(containerDir, id)
        if err != nil {
                log.G(ctx).WithError(err).Warnf("Failed to load container status for %q", id)
                status = unknownContainerStatus()
        }

        var containerIO *cio.ContainerIO
        err = func() error {
                // Load up-to-date status from containerd.
                t, err := cntr.Task(ctx, func(fifos *containerdio.FIFOSet) (_ containerdio.IO, err error) {
                        stdoutWC, stderrWC, err := c.createContainerLoggers(meta.LogPath, meta.Config.GetTty())
                        if err != nil {
                                return nil, err
                        }
                        defer func() {
                                if err != nil {
                                        if stdoutWC != nil {
                                                stdoutWC.Close()
                                        }
                                        if stderrWC != nil {
                                                stderrWC.Close()
                                        }
                                }
                        }()
                        containerIO, err = cio.NewContainerIO(id,
                                cio.WithFIFOs(fifos),
                        )
                        if err != nil {
                                return nil, err
                        }
                        containerIO.AddOutput("log", stdoutWC, stderrWC)
                        containerIO.Pipe()
                        return containerIO, nil
                })
                if err != nil && !errdefs.IsNotFound(err) {
                        return fmt.Errorf("failed to load task: %w", err)
                }
                var s containerd.Status
                var notFound bool
                if errdefs.IsNotFound(err) {
                        // Task is not found.
                        notFound = true
                } else {
                        // Task is found. Get task status.
                        s, err = t.Status(ctx)
                        if err != nil {
                                // It's still possible that task is deleted during this window.
                                if !errdefs.IsNotFound(err) {
                                        return fmt.Errorf("failed to get task status: %w", err)
                                }
                                notFound = true
                        }
                }
                if notFound {
                        // Task is not created or has been deleted, use the checkpointed status
                        // to generate container status.
                        switch status.State() {
                        case runtime.ContainerState_CONTAINER_CREATED:
                                // NOTE: Another possibility is that we've tried to start the container, but
                                // containerd got restarted during that. In that case, we still
                                // treat the container as `CREATED`.
                                containerIO, err = c.createContainerIO(id, meta.SandboxID, meta.Config)
                                if err != nil {
                                        return fmt.Errorf("failed to create container io: %w", err)
                                }
                        case runtime.ContainerState_CONTAINER_RUNNING:
                                // Container was in running state, but its task has been deleted,
                                // set unknown exited state. Container io is not needed in this case.
                                status.FinishedAt = time.Now().UnixNano()
                                status.ExitCode = unknownExitCode
                                status.Reason = unknownExitReason
                        default:
                                // Container is in exited/unknown state, return the status as it is.
                        }
                } else {
                        // Task status is found. Update container status based on the up-to-date task status.
                        switch s.Status {
                        case containerd.Created:
                                // Task has been created, but not started yet. This could only happen if containerd
                                // gets restarted during container start.
                                // Container must be in `CREATED` state.
                                if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
                                        return fmt.Errorf("failed to delete task: %w", err)
                                }
                                if status.State() != runtime.ContainerState_CONTAINER_CREATED {
                                        return fmt.Errorf("unexpected container state for created task: %q", status.State())
                                }
                        case containerd.Running:
                                // Task is running. Container must be in `RUNNING` state, based on our assumption that
                                // "task should not be started when containerd is down".
                                switch status.State() {
                                case runtime.ContainerState_CONTAINER_EXITED:
                                        return fmt.Errorf("unexpected container state for running task: %q", status.State())
                                case runtime.ContainerState_CONTAINER_RUNNING:
                                default:
                                        // This may happen if containerd gets restarted after task is started, but
                                        // before status is checkpointed.
                                        status.StartedAt = time.Now().UnixNano()
                                        status.Pid = t.Pid()
                                }
                                // Wait for the task for exit monitor.
                                // wait is a long running background request, no timeout needed.
                                exitCh, err := t.Wait(ctrdutil.NamespacedContext())
                                if err != nil {
                                        if !errdefs.IsNotFound(err) {
                                                return fmt.Errorf("failed to wait for task: %w", err)
                                        }
                                        // Container was in running state, but its task has been deleted,
                                        // set unknown exited state.
                                        status.FinishedAt = time.Now().UnixNano()
                                        status.ExitCode = unknownExitCode
                                        status.Reason = unknownExitReason
                                } else {
                                        // Start exit monitor.
                                        c.startContainerExitMonitor(context.Background(), id, status.Pid, exitCh)
                                }
                        case containerd.Stopped:
                                // Task is stopped. Update status and delete the task.
                                if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
                                        return fmt.Errorf("failed to delete task: %w", err)
                                }
                                status.FinishedAt = s.ExitTime.UnixNano()
                                status.ExitCode = int32(s.ExitStatus)
                        default:
                                return fmt.Errorf("unexpected task status %q", s.Status)
                        }
                }
                return nil
        }()
        if err != nil {
                log.G(ctx).WithError(err).Errorf("Failed to load container status for %q", id)
                // Only set the unknown field in this case, because other fields may
                // contain useful information loaded from the checkpoint.
                status.Unknown = true
        }
        opts := []containerstore.Opts{
                containerstore.WithStatus(status, containerDir),
                containerstore.WithContainer(cntr),
        }
        // containerIO could be nil for container in unknown state.
        if containerIO != nil {
                opts = append(opts, containerstore.WithContainerIO(containerIO))
        }
        return containerstore.NewContainer(*meta, opts...)
}

// podSandboxRecover is an additional interface implemented by podsandbox/ controller to handle
// Pod sandbox containers recovery.
type podSandboxRecover interface {
        // RecoverContainer returns the sandbox store entry for the given
        // containerd container, or an error when it cannot be produced.
        RecoverContainer(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error)
}

// getNetNS loads the network namespace recorded at meta.NetNSPath.
// Sandboxes that use the host network have no namespace of their own to
// load, so nil is returned for them.
func getNetNS(meta *sandboxstore.Metadata) *netns.NetNS {
        if !hostNetwork(meta.Config) {
                return netns.LoadNetNS(meta.NetNSPath)
        }
        return nil
}

// cleanupOrphanedIDDirs removes every sub-directory of base whose name is not
// the ID of one of cntrs.
//
// A missing base directory is not an error; any other failure to read it is
// returned. Non-directory entries are reported with a warning and left alone.
// Failures to remove an individual directory are only logged, so the function
// still returns nil in that case.
func cleanupOrphanedIDDirs(ctx context.Context, cntrs []containerd.Container, base string) error {
        entries, err := os.ReadDir(base)
        if err != nil && !os.IsNotExist(err) {
                return fmt.Errorf("failed to read base directory: %w", err)
        }

        // Set of IDs that still belong to a known container.
        known := make(map[string]struct{}, len(cntrs))
        for _, cntr := range cntrs {
                known[cntr.ID()] = struct{}{}
        }

        for _, entry := range entries {
                name := entry.Name()
                if !entry.IsDir() {
                        log.G(ctx).Warnf("Invalid file %q found in base directory %q", name, base)
                        continue
                }
                if _, found := known[name]; found {
                        // The directory still has an owner, keep it.
                        continue
                }
                orphan := filepath.Join(base, name)
                if rmErr := ensureRemoveAll(ctx, orphan); rmErr != nil {
                        log.G(ctx).WithError(rmErr).Warnf("Failed to remove id directory %q", orphan)
                        continue
                }
                log.G(ctx).Debugf("Cleanup orphaned id directory %q", orphan)
        }
        return nil
}

// createContainerIO builds the ContainerIO for containerID.
//
// The sandbox identified by sandboxID is looked up in the sandbox store and
// its runtime configuration decides the IO flavour: streaming IO is attached
// to the sandbox endpoint address, every other IO type gets new FIFOs under
// the container's volatile root directory. config must not be nil; its TTY
// and stdin settings are passed on to the IO options.
func (c *criService) createContainerIO(containerID, sandboxID string, config *runtime.ContainerConfig) (*cio.ContainerIO, error) {
        if config == nil {
                return nil, fmt.Errorf("ContainerConfig should not be nil when create container io")
        }

        sandbox, err := c.sandboxStore.Get(sandboxID)
        if err != nil {
                return nil, fmt.Errorf("an error occurred when try to find sandbox %q: %w", sandboxID, err)
        }

        ociRuntime, err := c.config.GetSandboxRuntime(sandbox.Config, sandbox.Metadata.RuntimeHandler)
        if err != nil {
                return nil, fmt.Errorf("failed to get sandbox runtime: %w", err)
        }

        var (
                containerIO *cio.ContainerIO
                tty         = config.GetTty()
                stdin       = config.GetStdin()
        )
        if ociRuntime.IOType == criconfig.IOTypeStreaming {
                containerIO, err = cio.NewContainerIO(containerID,
                        cio.WithStreams(sandbox.Endpoint.Address, tty, stdin))
        } else {
                fifoDir := c.getVolatileContainerRootDir(containerID)
                containerIO, err = cio.NewContainerIO(containerID,
                        cio.WithNewFIFOs(fifoDir, tty, stdin))
        }
        if err != nil {
                return nil, fmt.Errorf("failed to create container io: %w", err)
        }
        return containerIO, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// RuntimeConfig returns configuration information of the runtime. Only the
// Linux section of the response is populated here; the request itself is not
// inspected and the returned error is always nil.
func (c *criService) RuntimeConfig(ctx context.Context, r *runtime.RuntimeConfigRequest) (*runtime.RuntimeConfigResponse, error) {
        linuxConfig := c.getLinuxRuntimeConfig(ctx)
        return &runtime.RuntimeConfigResponse{Linux: linuxConfig}, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "sort"

        runcoptions "github.com/containerd/containerd/api/types/runc/options"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        "github.com/containerd/containerd/v2/internal/cri/systemd"
        "github.com/containerd/log"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// getLinuxRuntimeConfig assembles the Linux runtime configuration, which
// currently consists of the cgroup driver chosen by getCgroupDriver.
func (c *criService) getLinuxRuntimeConfig(ctx context.Context) *runtime.LinuxRuntimeConfiguration {
        driver := c.getCgroupDriver(ctx)
        return &runtime.LinuxRuntimeConfiguration{CgroupDriver: driver}
}

// getCgroupDriver picks the cgroup driver reported through the runtime
// configuration.
//
// The configured runtime handlers are inspected in a deterministic order: the
// default handler first, the remaining ones alphabetically. The first handler
// whose options expose a cgroup driver setting decides the result. Handlers
// whose options cannot be parsed are skipped (the parse error is logged at
// debug level). When no handler provides a setting the driver is
// auto-detected: systemd if systemd.IsRunningSystemd() reports true,
// cgroupfs otherwise.
func (c *criService) getCgroupDriver(ctx context.Context) runtime.CgroupDriver {
        // Go through the runtime handlers in a predictable order, starting from the
        // default handler, others sorted in alphabetical order
        handlerNames := make([]string, 0, len(c.config.ContainerdConfig.Runtimes))
        for n := range c.config.ContainerdConfig.Runtimes {
                handlerNames = append(handlerNames, n)
        }
        sort.Slice(handlerNames, func(i, j int) bool {
                if handlerNames[i] == c.config.ContainerdConfig.DefaultRuntimeName {
                        return true
                }
                if handlerNames[j] == c.config.ContainerdConfig.DefaultRuntimeName {
                        return false
                }
                return handlerNames[i] < handlerNames[j]
        })

        for _, handler := range handlerNames {
                opts, err := criconfig.GenerateRuntimeOptions(c.config.ContainerdConfig.Runtimes[handler])
                if err != nil {
                        // One unparsable handler must not stop the others from being
                        // consulted. Attach the error so the log says why it was skipped.
                        log.G(ctx).WithError(err).Debugf("failed to parse runtime handler options for %q", handler)
                        continue
                }
                if d, ok := getCgroupDriverFromRuntimeHandlerOpts(opts); ok {
                        return d
                }
                log.G(ctx).Debugf("runtime handler %q does not provide cgroup driver information", handler)
        }

        // If no runtime handlers have a setting, detect if systemd is running
        d := runtime.CgroupDriver_CGROUPFS
        if systemd.IsRunningSystemd() {
                d = runtime.CgroupDriver_SYSTEMD
        }
        log.G(ctx).Debugf("no runtime handler provided cgroup driver setting, using auto-detected %s", runtime.CgroupDriver_name[int32(d)])
        return d
}

// getCgroupDriverFromRuntimeHandlerOpts extracts the cgroup driver from a
// runtime handler's parsed options.
//
// Only *runcoptions.Options carries the information: its SystemdCgroup flag
// selects systemd when set and cgroupfs otherwise, and the boolean result is
// true. For any other option type the boolean result is false and the
// returned driver value must be ignored.
func getCgroupDriverFromRuntimeHandlerOpts(opts interface{}) (runtime.CgroupDriver, bool) {
        runcOpts, isRunc := opts.(*runcoptions.Options)
        if !isRunc {
                return runtime.CgroupDriver_SYSTEMD, false
        }
        if runcOpts.SystemdCgroup {
                return runtime.CgroupDriver_SYSTEMD, true
        }
        return runtime.CgroupDriver_CGROUPFS, true
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "time"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
)

// ListPodSandbox returns a list of Sandbox. Every sandbox in the store is
// converted to its CRI representation and the request filter, if any, is
// applied afterwards. The elapsed time is recorded in sandboxListTimer.
func (c *criService) ListPodSandbox(ctx context.Context, r *runtime.ListPodSandboxRequest) (*runtime.ListPodSandboxResponse, error) {
        begin := time.Now()

        // Convert everything held by the store before filtering.
        var items []*runtime.PodSandbox
        for _, stored := range c.sandboxStore.List() {
                items = append(items, toCRISandbox(stored.Metadata, stored.Status.Get()))
        }
        items = c.filterCRISandboxes(items, r.GetFilter())

        sandboxListTimer.UpdateSince(begin)
        return &runtime.ListPodSandboxResponse{Items: items}, nil
}

// toCRISandbox converts sandbox metadata into CRI pod sandbox. The reported
// state is SANDBOX_READY only when status.State equals
// sandboxstore.StateReady; every other state maps to SANDBOX_NOTREADY.
func toCRISandbox(meta sandboxstore.Metadata, status sandboxstore.Status) *runtime.PodSandbox {
        cfg := meta.Config
        sandbox := &runtime.PodSandbox{
                Id:             meta.ID,
                Metadata:       cfg.GetMetadata(),
                State:          runtime.PodSandboxState_SANDBOX_NOTREADY,
                CreatedAt:      status.CreatedAt.UnixNano(),
                Labels:         cfg.GetLabels(),
                Annotations:    cfg.GetAnnotations(),
                RuntimeHandler: meta.RuntimeHandler,
        }
        if status.State == sandboxstore.StateReady {
                sandbox.State = runtime.PodSandboxState_SANDBOX_READY
        }
        return sandbox
}

// normalizePodSandboxFilter rewrites filter.Id in place with the ID of the
// sandbox that the store returns for it. The filter is left untouched when
// the lookup fails.
func (c *criService) normalizePodSandboxFilter(filter *runtime.PodSandboxFilter) {
        sandbox, err := c.sandboxStore.Get(filter.GetId())
        if err != nil {
                return
        }
        filter.Id = sandbox.ID
}

// normalizePodSandboxStatsFilter rewrites filter.Id in place with the ID of
// the sandbox that the store returns for it. The filter is left untouched
// when the lookup fails.
func (c *criService) normalizePodSandboxStatsFilter(filter *runtime.PodSandboxStatsFilter) {
        sandbox, err := c.sandboxStore.Get(filter.GetId())
        if err != nil {
                return
        }
        filter.Id = sandbox.ID
}

// filterCRISandboxes filters CRISandboxes.
//
// A nil filter returns the input slice unchanged. Otherwise the filter is
// normalized first (which may rewrite filter.Id in place) and a new, non-nil
// slice is returned containing the sandboxes that satisfy every criterion
// that is set: ID, state, and all key/value pairs of the label selector.
func (c *criService) filterCRISandboxes(sandboxes []*runtime.PodSandbox, filter *runtime.PodSandboxFilter) []*runtime.PodSandbox {
        if filter == nil {
                return sandboxes
        }
        c.normalizePodSandboxFilter(filter)

        wantID := filter.GetId()
        wantState := filter.GetState()
        wantLabels := filter.GetLabelSelector()

        filtered := []*runtime.PodSandbox{}
nextSandbox:
        for _, candidate := range sandboxes {
                // An empty ID means "any sandbox".
                if wantID != "" && wantID != candidate.Id {
                        continue
                }
                // A nil state means "any state".
                if wantState != nil && wantState.GetState() != candidate.State {
                        continue
                }
                // Every selector entry must be present with the same value.
                for key, want := range wantLabels {
                        if got, ok := candidate.Labels[key]; !ok || got != want {
                                continue nextSandbox
                        }
                }
                filtered = append(filtered, candidate)
        }

        return filtered
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "errors"
        "fmt"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
)

// PortForward prepares a streaming endpoint to forward ports from a PodSandbox, and returns the address.
// The request is rejected when the sandbox is unknown to the store or when
// its state is not sandboxstore.StateReady.
func (c *criService) PortForward(ctx context.Context, r *runtime.PortForwardRequest) (retRes *runtime.PortForwardResponse, retErr error) {
        sandboxID := r.GetPodSandboxId()
        sandbox, err := c.sandboxStore.Get(sandboxID)
        if err != nil {
                return nil, fmt.Errorf("failed to find sandbox %q: %w", sandboxID, err)
        }
        if state := sandbox.Status.Get().State; state != sandboxstore.StateReady {
                return nil, errors.New("sandbox container is not running")
        }
        // TODO(random-liu): Verify that ports are exposed.
        return c.streamServer.GetPortForward(r)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "io"
        "net"
        "time"

        "github.com/containerd/log"
        "github.com/containernetworking/plugins/pkg/ns"
)

// portForward uses netns to enter the sandbox namespace, and forwards a stream inside the
// namespace to a specific port. It keeps forwarding until it exits or client disconnect.
//
// id is looked up in the sandbox store. For host network sandboxes the
// forwarding function runs directly, without entering a namespace. stream is
// closed by the forwarding function once it is invoked. Any error is wrapped
// with the network namespace path ("host" for host network sandboxes).
func (c *criService) portForward(ctx context.Context, id string, port int32, stream io.ReadWriteCloser) error {
        s, err := c.sandboxStore.Get(id)
        if err != nil {
                return fmt.Errorf("failed to find sandbox %q in store: %w", id, err)
        }

        var (
                // netNSDo runs the given function in the network context selected below.
                netNSDo func(func(ns.NetNS) error) error
                // netNSPath is the network namespace path for logging.
                netNSPath string
        )
        if !hostNetwork(s.Config) {
                // Refuse to forward into a namespace that is already closed.
                if closed, err := s.NetNS.Closed(); err != nil {
                        return fmt.Errorf("failed to check netwok namespace closed for sandbox %q: %w", id, err)
                } else if closed {
                        return fmt.Errorf("network namespace for sandbox %q is closed", id)
                }
                netNSDo = s.NetNS.Do
                netNSPath = s.NetNS.GetPath()
        } else {
                // Run the function directly for host network.
                netNSDo = func(do func(_ ns.NetNS) error) error {
                        return do(nil)
                }
                netNSPath = "host"
        }

        log.G(ctx).Infof("Executing port forwarding in network namespace %q", netNSPath)
        err = netNSDo(func(_ ns.NetNS) error {
                defer stream.Close()
                // localhost can resolve to both IPv4 and IPv6 addresses in dual-stack systems
                // but the application can be listening in one of the IP families only.
                // golang has enabled RFC 6555 Fast Fallback (aka HappyEyeballs) by default in 1.12
                // It means that if a host resolves to both IPv6 and IPv4, it will try to connect to any
                // of those addresses and use the working connection.
                // However, the implementation uses goroutines to start both connections in parallel,
                // and this causes the connection to be made outside the namespace, so we try to connect
                // serially.
                // We try IPv4 first to keep current behavior and we fallback to IPv6 if the connection fails.
                // xref https://github.com/golang/go/issues/44922
                var conn net.Conn
                conn, err := net.Dial("tcp4", fmt.Sprintf("localhost:%d", port))
                if err != nil {
                        var errV6 error
                        conn, errV6 = net.Dial("tcp6", fmt.Sprintf("localhost:%d", port))
                        if errV6 != nil {
                                return fmt.Errorf("failed to connect to localhost:%d inside namespace %q, IPv4: %v IPv6 %v ", port, id, err, errV6)
                        }
                }
                defer conn.Close()

                // Buffered for both copy goroutines so that each can deliver its
                // result even when nobody is receiving any more.
                errCh := make(chan error, 2)
                // Copy from the namespace port connection to the client stream
                go func() {
                        log.G(ctx).Debugf("PortForward copying data from namespace %q port %d to the client stream", id, port)
                        _, err := io.Copy(stream, conn)
                        errCh <- err
                }()

                // Copy from the client stream to the namespace port connection
                go func() {
                        log.G(ctx).Debugf("PortForward copying data from client stream to namespace %q port %d", id, port)
                        _, err := io.Copy(conn, stream)
                        errCh <- err
                }()

                // Wait until the first error is returned by one of the connections
                // we use errFwd to store the result of the port forwarding operation
                // if the context is cancelled close everything and return
                var errFwd error
                select {
                case errFwd = <-errCh:
                        log.G(ctx).Debugf("PortForward stop forwarding in one direction in network namespace %q port %d: %v", id, port, errFwd)
                case <-ctx.Done():
                        log.G(ctx).Debugf("PortForward cancelled in network namespace %q port %d: %v", id, port, ctx.Err())
                        return ctx.Err()
                }
                // give a chance to terminate gracefully or timeout
                // after 1s
                // https://linux.die.net/man/1/socat
                const timeout = time.Second
                select {
                case e := <-errCh:
                        // Keep the first direction's error; only fall back to the
                        // second one when the first finished cleanly.
                        if errFwd == nil {
                                errFwd = e
                        }
                        log.G(ctx).Debugf("PortForward stopped forwarding in both directions in network namespace %q port %d: %v", id, port, e)
                case <-time.After(timeout):
                        log.G(ctx).Debugf("PortForward timed out waiting to close the connection in network namespace %q port %d", id, port)
                case <-ctx.Done():
                        log.G(ctx).Debugf("PortForward cancelled in network namespace %q port %d: %v", id, port, ctx.Err())
                        errFwd = ctx.Err()
                }

                return errFwd
        })

        if err != nil {
                return fmt.Errorf("failed to execute portforward in network namespace %q: %w", netNSPath, err)
        }
        log.G(ctx).Infof("Finish port forwarding for %q port %d", id, port)

        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "time"

        "github.com/containerd/errdefs"
        "github.com/containerd/log"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// RemovePodSandbox removes the sandbox. If there are running containers in the
// sandbox, they should be forcibly removed.
//
// A sandbox ID that is not found in the store is not an error: an empty
// response is returned. Otherwise the steps run in this order: stop the
// sandbox, verify its network namespace is closed, remove its containers,
// shut the sandbox down, emit the deleted event, notify NRI, delete the store
// entries, and release the reserved name. Any failing step other than the NRI
// notification aborts the removal with an error.
func (c *criService) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodSandboxRequest) (*runtime.RemovePodSandboxResponse, error) {
        start := time.Now()
        sandbox, err := c.sandboxStore.Get(r.GetPodSandboxId())
        if err != nil {
                if !errdefs.IsNotFound(err) {
                        return nil, fmt.Errorf("an error occurred when try to find sandbox %q: %w",
                                r.GetPodSandboxId(), err)
                }
                // Do not return error if the id doesn't exist.
                log.G(ctx).Tracef("RemovePodSandbox called for sandbox %q that does not exist",
                        r.GetPodSandboxId())
                return &runtime.RemovePodSandboxResponse{}, nil
        }
        // Use the full sandbox id.
        id := sandbox.ID

        // If the sandbox is still running, not ready, or in an unknown state, forcibly stop it.
        // Even if it's in a NotReady state, this will close its network namespace, if open.
        // This can happen if the task process associated with the Pod died or it was killed.
        log.G(ctx).Infof("Forcibly stopping sandbox %q", id)
        if err := c.stopPodSandbox(ctx, sandbox); err != nil {
                return nil, fmt.Errorf("failed to forcibly stop sandbox %q: %w", id, err)
        }

        // Return error if sandbox network namespace is not closed yet.
        if sandbox.NetNS != nil {
                nsPath := sandbox.NetNS.GetPath()
                if closed, err := sandbox.NetNS.Closed(); err != nil {
                        return nil, fmt.Errorf("failed to check sandbox network namespace %q closed: %w", nsPath, err)
                } else if !closed {
                        return nil, fmt.Errorf("sandbox network namespace %q is not fully closed", nsPath)
                }
        }

        // Remove all containers inside the sandbox.
        // NOTE(random-liu): container could still be created after this point, Kubelet should
        // not rely on this behavior.
        // TODO(random-liu): Introduce an intermediate state to avoid container creation after
        // this point.
        cntrs := c.containerStore.List()
        for _, cntr := range cntrs {
                if cntr.SandboxID != id {
                        continue
                }
                _, err = c.RemoveContainer(ctx, &runtime.RemoveContainerRequest{ContainerId: cntr.ID})
                if err != nil {
                        return nil, fmt.Errorf("failed to remove container %q: %w", cntr.ID, err)
                }
        }

        // A sandbox that the sandbox service no longer knows about is treated as
        // already shut down.
        if err := c.sandboxService.ShutdownSandbox(ctx, sandbox.Sandboxer, id); err != nil && !errdefs.IsNotFound(err) {
                return nil, fmt.Errorf("failed to delete sandbox %q: %w", id, err)
        }

        // Send CONTAINER_DELETED event with ContainerId equal to SandboxId.
        c.generateAndSendContainerEvent(ctx, id, id, runtime.ContainerEventType_CONTAINER_DELETED_EVENT)

        // An NRI notification failure is logged only; it does not abort the removal.
        err = c.nri.RemovePodSandbox(ctx, &sandbox)
        if err != nil {
                log.G(ctx).WithError(err).Errorf("NRI pod removal notification failed")
        }

        // Remove sandbox from sandbox store. Note that once the sandbox is successfully
        // deleted:
        // 1) ListPodSandbox will not include this sandbox.
        // 2) PodSandboxStatus and StopPodSandbox will return error.
        // 3) On-going operations which have held the reference will not be affected.
        c.sandboxStore.Delete(id)

        // Also drop the sandbox record kept by the containerd client's sandbox
        // store; a missing record only produces a warning.
        if err := c.client.SandboxStore().Delete(ctx, id); err != nil {
                if !errdefs.IsNotFound(err) {
                        return nil, fmt.Errorf("failed to remove sandbox metadata from store: %w", err)
                }
                log.G(ctx).WithError(err).Warnf("failed to delete sandbox metadata from store: %q maybe recovered from v1.x release", id)
        }

        // Release the sandbox name reserved for the sandbox.
        c.sandboxNameIndex.ReleaseByKey(id)

        // Record the removal duration, labelled with the sandbox's runtime handler.
        sandboxRemoveTimer.WithValues(sandbox.RuntimeHandler).UpdateSince(start)

        return &runtime.RemovePodSandboxResponse{}, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "encoding/json"
        "errors"
        "fmt"
        "math"
        "path/filepath"
        "strings"
        "time"

        "github.com/containerd/go-cni"
        "github.com/containerd/log"
        "github.com/containerd/typeurl/v2"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        sb "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/internal/cri/annotations"
        "github.com/containerd/containerd/v2/internal/cri/bandwidth"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        "github.com/containerd/containerd/v2/internal/cri/server/podsandbox"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/containerd/v2/pkg/netns"
)

// init registers sandboxstore.Metadata with typeurl under the path segments
// "github.com/containerd/cri/pkg/store/sandbox" and "Metadata".
// NOTE(review): the path looks like the former cri repository location,
// presumably kept for compatibility with already-serialized data — confirm.
func init() {
        typeurl.Register(
                &sandboxstore.Metadata{},
                "github.com/containerd/cri/pkg/store/sandbox",
                "Metadata",
        )
}

// RunPodSandbox creates and starts a pod-level sandbox. Runtimes should ensure
// the sandbox is in ready state.
//
// The work is done in this order:
//  1. generate an id, build the sandbox name and reserve it;
//  2. persist an initial record through c.client.SandboxStore();
//  3. for pods that use neither the host network nor a pod-level user
//     namespace, create a network namespace and set up the pod network;
//  4. create and start the sandbox through c.sandboxService;
//  5. for pods with a pod-level user namespace, derive the network namespace
//     from the started sandbox's PID and set up the pod network;
//  6. notify NRI, mark the sandbox ready, add it to c.sandboxStore, emit the
//     container events and start the exit monitor.
//
// On failure the deferred functions undo the steps in reverse order. Each of
// them only runs while cleanupErr is still nil; once one cleanup step fails,
// the remaining ones are skipped and the reserved name is kept.
//
// It returns the id of the new sandbox on success, or a wrapped error
// describing the step that failed.
func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandboxRequest) (_ *runtime.RunPodSandboxResponse, retErr error) {
        config := r.GetConfig()
        log.G(ctx).Debugf("Sandbox config %+v", config)

        // Generate unique id and name for the sandbox and reserve the name.
        id := util.GenerateID()
        metadata := config.GetMetadata()
        if metadata == nil {
                return nil, errors.New("sandbox config must include metadata")
        }
        name := makeSandboxName(metadata)
        log.G(ctx).WithField("podsandboxid", id).Debugf("generated id for sandbox name %q", name)

        // cleanupErr records the last error returned by the critical cleanup operations in deferred functions,
        // like CNI teardown and stopping the running sandbox task.
        // If cleanup is not completed for some reason, the CRI-plugin will leave the sandbox
        // in a not-ready state, which can later be cleaned up by the next execution of the kubelet's syncPod workflow.
        var cleanupErr error

        // Reserve the sandbox name to avoid concurrent `RunPodSandbox` request starting the
        // same sandbox.
        if err := c.sandboxNameIndex.Reserve(name, id); err != nil {
                return nil, fmt.Errorf("failed to reserve sandbox name %q: %w", name, err)
        }
        defer func() {
                // Release the name if the function returns with an error.
                // When cleanupErr != nil, the name will be cleaned in sandbox_remove.
                if retErr != nil && cleanupErr == nil {
                        c.sandboxNameIndex.ReleaseByName(name)
                }
        }()

        var (
                err         error
                sandboxInfo = sb.Sandbox{ID: id}
        )

        ociRuntime, err := c.config.GetSandboxRuntime(config, r.GetRuntimeHandler())
        if err != nil {
                return nil, fmt.Errorf("unable to get OCI runtime for sandbox %q: %w", id, err)
        }

        sandboxInfo.Runtime.Name = ociRuntime.Type
        sandboxInfo.Sandboxer = ociRuntime.Sandboxer

        // runtimeStart feeds sandboxRuntimeCreateTimer at the very end of the function.
        runtimeStart := time.Now()
        // Retrieve runtime options
        runtimeOpts, err := criconfig.GenerateRuntimeOptions(ociRuntime)
        if err != nil {
                return nil, fmt.Errorf("failed to generate sandbox runtime options: %w", err)
        }

        if runtimeOpts != nil {
                sandboxInfo.Runtime.Options, err = typeurl.MarshalAny(runtimeOpts)
                if err != nil {
                        return nil, fmt.Errorf("failed to marshal runtime options: %w", err)
                }
        }

        // Save sandbox name
        sandboxInfo.AddLabel("name", name)

        // Create initial internal sandbox object.
        sandbox := sandboxstore.NewSandbox(
                sandboxstore.Metadata{
                        ID:             id,
                        Name:           name,
                        Config:         config,
                        RuntimeHandler: r.GetRuntimeHandler(),
                },
                sandboxstore.Status{
                        State:     sandboxstore.StateUnknown,
                        CreatedAt: time.Now().UTC(),
                },
        )
        sandbox.Sandboxer = ociRuntime.Sandboxer

        if _, err := c.client.SandboxStore().Create(ctx, sandboxInfo); err != nil {
                return nil, fmt.Errorf("failed to save sandbox metadata: %w", err)
        }
        defer func() {
                // Drop the persisted record only if every later cleanup step succeeded.
                if retErr != nil && cleanupErr == nil {
                        cleanupErr = c.client.SandboxStore().Delete(ctx, id)
                }
        }()

        defer func() {
                // Put the sandbox into sandbox store when some resources fail to be cleaned.
                if retErr != nil && cleanupErr != nil {
                        log.G(ctx).WithError(cleanupErr).Errorf("encountered an error cleaning up failed sandbox %q, marking sandbox state as SANDBOX_UNKNOWN", id)
                        if err := c.sandboxStore.Add(sandbox); err != nil {
                                log.G(ctx).WithError(err).Errorf("failed to add sandbox %+v into store", sandbox)
                        }
                }
        }()

        // XXX: What we really want here is to call controller.Platform() and then check
        // platform.OS, but that is only populated after controller.Create() and that needs to be
        // done later (uses sandbox.NSPath that we will set just _after_ this).
        // So, lets check for the Linux section on the config, if that is populated, we assume the
        // platform is linux.
        // This is a hack, we should improve the controller interface to return the platform
        // earlier. But should work fine for this specific use.
        userNsEnabled := false
        if linux := config.GetLinux(); linux != nil {
                usernsOpts := linux.GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
                if usernsOpts != nil && usernsOpts.GetMode() == runtime.NamespaceMode_POD {
                        userNsEnabled = true
                }
        }

        // Setup the network namespace if host networking wasn't requested.
        if !hostNetwork(config) && !userNsEnabled {
                // XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too.
                // We can't move this to a function, as the defer calls need to be executed if other
                // errors are returned in this function. So, we would need more refactors to move
                // this code to a function and the idea was to not change the current code for
                // !userNsEnabled case, therefore doing it would defeat the purpose.
                //
                // The difference between the cases is the use of netns.NewNetNS() vs
                // netns.NewNetNSFromPID().
                //
                // To simplify this, in the future, we should just remove this case (podNetwork &&
                // !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled).
                netStart := time.Now()
                // If it is not in host network namespace then create a namespace and set the sandbox
                // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
                // namespaces. If the pod is in host network namespace then both are empty and should not
                // be used.
                var netnsMountDir = "/var/run/netns"
                if c.config.NetNSMountsUnderStateDir {
                        netnsMountDir = filepath.Join(c.config.StateDir, "netns")
                }
                sandbox.NetNS, err = netns.NewNetNS(netnsMountDir)
                if err != nil {
                        return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
                }
                // Update network namespace in the store, which is used to generate the container's spec
                sandbox.NetNSPath = sandbox.NetNS.GetPath()
                defer func() {
                        // Remove the network namespace only if all the resource cleanup is done
                        if retErr != nil && cleanupErr == nil {
                                if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil {
                                        log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id)
                                        return
                                }
                                sandbox.NetNSPath = ""
                        }
                }()

                if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil {
                        return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err)
                }
                // Save sandbox metadata to store
                if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil {
                        return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
                }

                // Define this defer to teardownPodNetwork prior to the setupPodNetwork function call.
                // This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource
                // creation functions.
                defer func() {
                        // Remove the network namespace only if all the resource cleanup is done.
                        if retErr != nil && cleanupErr == nil {
                                deferCtx, deferCancel := util.DeferContext()
                                defer deferCancel()
                                // Teardown network if an error is returned.
                                if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil {
                                        log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id)
                                }

                        }
                }()

                // Setup network for sandbox.
                // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
                // rely on the assumption that CRI shim will not be querying the network namespace to check the
                // network states such as IP.
                // In future runtime implementation should avoid relying on CRI shim implementation details.
                // In this case however caching the IP will add a subtle performance enhancement by avoiding
                // calls to network namespace of the pod to query the IP of the veth interface on every
                // SandboxStatus request.
                if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
                        return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
                }
                sandboxCreateNetworkTimer.UpdateSince(netStart)
        }

        if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil {
                return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err)
        }

        // Save sandbox metadata to store
        if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil {
                return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
        }

        if err := c.sandboxService.CreateSandbox(ctx, sandboxInfo, sb.WithOptions(config), sb.WithNetNSPath(sandbox.NetNSPath)); err != nil {
                return nil, fmt.Errorf("failed to create sandbox %q: %w", id, err)
        }

        ctrl, err := c.sandboxService.StartSandbox(ctx, sandbox.Sandboxer, id)
        if err != nil {
                // A start failure may carry a cleanup error; record it in cleanupErr so the
                // deferred functions above skip their own cleanup.
                var cerr podsandbox.CleanupErr
                if errors.As(err, &cerr) {
                        cleanupErr = fmt.Errorf("failed to cleanup sandbox: %w", cerr)

                        // Strip last error as cleanup error to handle separately
                        if merr, ok := err.(interface{ Unwrap() []error }); ok {
                                if errs := merr.Unwrap(); len(errs) > 0 {
                                        err = errs[0]
                                }
                        }
                }
                return nil, fmt.Errorf("failed to start sandbox %q: %w", id, err)
        }

        if ctrl.Address != "" {
                sandbox.Endpoint = sandboxstore.Endpoint{
                        Version: ctrl.Version,
                        Address: ctrl.Address,
                }
        }

        if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil {
                return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
        }

        if !hostNetwork(config) && userNsEnabled {
                // If userns is enabled, then the netns was created by the OCI runtime
                // on controller.Start(). The OCI runtime needs to create the netns
                // because, if userns is in use, the netns needs to be owned by the
                // userns. So, let the OCI runtime just handle this for us.
                // If the netns is not owned by the userns several problems will happen.
                // For instance, the container will lack permission (even if
                // capabilities are present) to modify the netns or, even worse, the OCI
                // runtime will fail to mount sysfs:
                //      https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164
                //
                // Note we do this after controller.Start(), as before that we
                // can't get the PID for the sandbox that we need for the netns.
                // Doing a controller.Status() call before that fails (can't
                // find the sandbox) so we can't get the PID.
                netStart := time.Now()

                // If it is not in host network namespace then create a namespace and set the sandbox
                // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
                // namespaces. If the pod is in host network namespace then both are empty and should not
                // be used.
                var netnsMountDir = "/var/run/netns"
                if c.config.NetNSMountsUnderStateDir {
                        netnsMountDir = filepath.Join(c.config.StateDir, "netns")
                }

                sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, ctrl.Pid)
                if err != nil {
                        return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
                }

                // Update network namespace in the store, which is used to generate the container's spec
                sandbox.NetNSPath = sandbox.NetNS.GetPath()
                defer func() {
                        // Remove the network namespace only if all the resource cleanup is done
                        if retErr != nil && cleanupErr == nil {
                                if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil {
                                        log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id)
                                        return
                                }
                                sandbox.NetNSPath = ""
                        }
                }()

                if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil {
                        return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err)
                }
                // Save sandbox metadata to store
                if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil {
                        return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
                }

                // Define this defer to teardownPodNetwork prior to the setupPodNetwork function call.
                // This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource
                // creation functions.
                defer func() {
                        // Remove the network namespace only if all the resource cleanup is done.
                        if retErr != nil && cleanupErr == nil {
                                deferCtx, deferCancel := util.DeferContext()
                                defer deferCancel()
                                // Teardown network if an error is returned.
                                if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil {
                                        log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id)
                                }

                        }
                }()

                // Setup network for sandbox.
                // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
                // rely on the assumption that CRI shim will not be querying the network namespace to check the
                // network states such as IP.
                // In future runtime implementation should avoid relying on CRI shim implementation details.
                // In this case however caching the IP will add a subtle performance enhancement by avoiding
                // calls to network namespace of the pod to query the IP of the veth interface on every
                // SandboxStatus request.
                if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
                        return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
                }
                sandboxCreateNetworkTimer.UpdateSince(netStart)
        }

        // TODO: get rid of this. sandbox object should no longer have Container field.
        if ociRuntime.Sandboxer == string(criconfig.ModePodSandbox) {
                container, err := c.client.LoadContainer(ctx, id)
                if err != nil {
                        return nil, fmt.Errorf("failed to load container %q for sandbox: %w", id, err)
                }
                sandbox.Container = container
        }

        // Normalise to a non-nil map; the label lookups below read keys that may be absent.
        labels := ctrl.Labels
        if labels == nil {
                labels = map[string]string{}
        }

        sandbox.ProcessLabel = labels["selinux_label"]

        err = c.nri.RunPodSandbox(ctx, &sandbox)
        if err != nil {
                return nil, fmt.Errorf("NRI RunPodSandbox failed: %w", err)
        }

        defer func() {
                // Unlike the other deferred cleanups, this one runs on any error,
                // regardless of cleanupErr.
                if retErr != nil {
                        deferCtx, deferCancel := util.DeferContext()
                        defer deferCancel()
                        c.nri.RemovePodSandbox(deferCtx, &sandbox)
                }
        }()

        if err := sandbox.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
                // Set the pod sandbox as ready after successfully start sandbox container.
                status.Pid = ctrl.Pid
                status.State = sandboxstore.StateReady
                status.CreatedAt = ctrl.CreatedAt
                return status, nil
        }); err != nil {
                return nil, fmt.Errorf("failed to update sandbox status: %w", err)
        }

        // Add the sandbox, whose status was just set to ready, into the sandbox store.
        if err := c.sandboxStore.Add(sandbox); err != nil {
                return nil, fmt.Errorf("failed to add sandbox %+v into store: %w", sandbox, err)
        }

        // Send CONTAINER_CREATED event with both ContainerId and SandboxId equal to SandboxId.
        // Note that this has to be done after sandboxStore.Add() because we need to get
        // SandboxStatus from the store and include it in the event.
        c.generateAndSendContainerEvent(ctx, id, id, runtime.ContainerEventType_CONTAINER_CREATED_EVENT)

        exitCh, err := c.sandboxService.WaitSandbox(util.NamespacedContext(), sandbox.Sandboxer, id)
        if err != nil {
                // Wrap with %w, like every other error return in this function, so the
                // underlying error stays reachable through errors.Is / errors.As.
                return nil, fmt.Errorf("failed to wait sandbox %s: %w", id, err)
        }

        // start the monitor after adding sandbox into the store, this ensures
        // that sandbox is in the store, when event monitor receives the TaskExit event.
        //
        // TaskOOM from containerd may come before sandbox is added to store,
        // but we don't care about sandbox TaskOOM right now, so it is fine.
        c.startSandboxExitMonitor(context.Background(), id, exitCh)

        // Send CONTAINER_STARTED event with ContainerId equal to SandboxId.
        c.generateAndSendContainerEvent(ctx, id, id, runtime.ContainerEventType_CONTAINER_STARTED_EVENT)

        sandboxRuntimeCreateTimer.WithValues(labels["oci_runtime_type"]).UpdateSince(runtimeStart)

        return &runtime.RunPodSandboxResponse{PodSandboxId: id}, nil
}

// getNetworkPlugin returns the network plugin registered for the given runtime
// class. When the runtime class has no entry of its own, the plugin stored
// under defaultNetworkPlugin is returned instead; nil is returned when neither
// entry exists.
func (c *criService) getNetworkPlugin(runtimeClass string) cni.CNI {
        // Indexing a nil map simply misses, so an unset c.netPlugin falls
        // through to the final nil return.
        for _, key := range []string{runtimeClass, defaultNetworkPlugin} {
                if plugin, found := c.netPlugin[key]; found {
                        return plugin
                }
        }
        return nil
}

// setupPodNetwork sets up the network for a pod.
//
// It runs the network plugin selected for the sandbox's runtime handler
// against the sandbox's network namespace path and, on success, stores the
// selected pod IP, the additional IPs and the raw CNI result on the sandbox.
// An error is returned when no plugin is available, when the namespace options
// cannot be built, when the plugin fails, or when the default interface
// carries no IP configuration.
func (c *criService) setupPodNetwork(ctx context.Context, sandbox *sandboxstore.Sandbox) error {
        id := sandbox.ID
        nsPath := sandbox.NetNSPath

        plugin := c.getNetworkPlugin(sandbox.RuntimeHandler)
        if plugin == nil {
                return errors.New("cni config not initialized")
        }

        nsOpts, err := cniNamespaceOpts(id, sandbox.Config)
        if err != nil {
                return fmt.Errorf("get cni namespace options: %w", err)
        }
        log.G(ctx).WithField("podsandboxid", id).Debugf("begin cni setup")

        began := time.Now()
        // Pick the serial or the default setup entry point based on configuration.
        runSetup := plugin.Setup
        if c.config.CniConfig.NetworkPluginSetupSerially {
                runSetup = plugin.SetupSerially
        }
        res, err := runSetup(ctx, id, nsPath, nsOpts...)

        // The operation count and latency are recorded whether or not setup succeeded.
        networkPluginOperations.WithValues(networkSetUpOp).Inc()
        networkPluginOperationsLatency.WithValues(networkSetUpOp).UpdateSince(began)
        if err != nil {
                networkPluginOperationsErrors.WithValues(networkSetUpOp).Inc()
                return err
        }
        logDebugCNIResult(ctx, id, res)

        // The default interface must carry at least one IP config.
        iface, found := res.Interfaces[defaultIfName]
        if !found || len(iface.IPConfigs) == 0 {
                return fmt.Errorf("failed to find network info for sandbox %q", id)
        }
        sandbox.IP, sandbox.AdditionalIPs = selectPodIPs(ctx, iface.IPConfigs, c.config.IPPreference)
        sandbox.CNIResult = res
        return nil
}

// cniNamespaceOpts builds the CNI namespace options for a sandbox from its
// config. Labels and the pod annotations capability are always included;
// port mappings, bandwidth, DNS and cgroup path are appended only when the
// config provides them. An error is returned when the bandwidth annotations
// cannot be converted.
func cniNamespaceOpts(id string, config *runtime.PodSandboxConfig) ([]cni.NamespaceOpts, error) {
        nsOpts := []cni.NamespaceOpts{
                cni.WithLabels(toCNILabels(id, config)),
                cni.WithCapability(annotations.PodAnnotations, config.Annotations),
        }

        if mappings := toCNIPortMappings(config.GetPortMappings()); len(mappings) > 0 {
                nsOpts = append(nsOpts, cni.WithCapabilityPortMap(mappings))
        }

        // Will return an error if the bandwidth limitation has the wrong unit
        // or an unreasonable value see validateBandwidthIsReasonable()
        limits, err := toCNIBandWidth(config.Annotations)
        switch {
        case err != nil:
                return nil, err
        case limits != nil:
                nsOpts = append(nsOpts, cni.WithCapabilityBandWidth(*limits))
        }

        if dnsConfig := toCNIDNS(config.GetDnsConfig()); dnsConfig != nil {
                nsOpts = append(nsOpts, cni.WithCapabilityDNS(*dnsConfig))
        }

        cgroupParent := config.GetLinux().GetCgroupParent()
        if cgroupParent != "" {
                nsOpts = append(nsOpts, cni.WithCapabilityCgroupPath(cgroupParent))
        }

        return nsOpts, nil
}

// toCNILabels builds the CNI label set for a sandbox: the pod namespace, name
// and UID taken from the config metadata, the sandbox id as the infra
// container id, and the fixed "IgnoreUnknown" entry.
func toCNILabels(id string, config *runtime.PodSandboxConfig) map[string]string {
        meta := config.GetMetadata()

        labels := make(map[string]string, 5)
        labels["K8S_POD_NAMESPACE"] = meta.GetNamespace()
        labels["K8S_POD_NAME"] = meta.GetName()
        labels["K8S_POD_INFRA_CONTAINER_ID"] = id
        labels["K8S_POD_UID"] = meta.GetUid()
        labels["IgnoreUnknown"] = "1"
        return labels
}

// toCNIBandWidth converts the pod bandwidth annotations into a CNI bandwidth
// setting. It returns (nil, nil) when neither an ingress nor an egress limit
// is present, and an error when the annotations cannot be read. For each
// direction that is present, the rate is taken from the annotation and the
// burst is set to math.MaxUint32.
func toCNIBandWidth(annotations map[string]string) (*cni.BandWidth, error) {
        ingress, egress, err := bandwidth.ExtractPodBandwidthResources(annotations)
        switch {
        case err != nil:
                return nil, fmt.Errorf("reading pod bandwidth annotations: %w", err)
        case ingress == nil && egress == nil:
                return nil, nil
        }

        var limits cni.BandWidth
        if ingress != nil {
                limits.IngressRate, limits.IngressBurst = uint64(ingress.Value()), math.MaxUint32
        }
        if egress != nil {
                limits.EgressRate, limits.EgressBurst = uint64(egress.Value()), math.MaxUint32
        }
        return &limits, nil
}

// toCNIPortMappings converts CRI port mappings into their CNI form. Entries
// without a positive host port are dropped, and the protocol name is
// lower-cased. The result is nil when no entry qualifies.
func toCNIPortMappings(criPortMappings []*runtime.PortMapping) []cni.PortMapping {
        var converted []cni.PortMapping
        for _, pm := range criPortMappings {
                if pm.HostPort > 0 {
                        converted = append(converted, cni.PortMapping{
                                HostPort:      pm.HostPort,
                                ContainerPort: pm.ContainerPort,
                                Protocol:      strings.ToLower(pm.Protocol.String()),
                                HostIP:        pm.HostIp,
                        })
                }
        }
        return converted
}

// toCNIDNS converts a CRI DNSConfig into the CNI DNS structure, copying the
// servers, searches and options. A nil input yields a nil result.
func toCNIDNS(dns *runtime.DNSConfig) *cni.DNS {
        if dns != nil {
                converted := cni.DNS{
                        Servers:  dns.GetServers(),
                        Searches: dns.GetSearches(),
                        Options:  dns.GetOptions(),
                }
                return &converted
        }
        return nil
}

// selectPodIPs picks the primary pod IP from the CNI IP configs and returns it
// together with the remaining addresses.
//
// A single config is returned as-is with no additional IPs. Otherwise the
// preference decides:
//   - "ipv6": the first address for which To4() is nil becomes primary;
//   - "cni": the first address in CNI order becomes primary;
//   - anything else (including "" and "ipv4"): the first address for which
//     To4() is non-nil becomes primary; an unrecognised value logs a warning.
//
// When no address of the preferred family exists, the first address in CNI
// order is used. The additional IPs keep the original order with the primary
// removed.
func selectPodIPs(ctx context.Context, configs []*cni.IPConfig, preference string) (string, []string) {
        if len(configs) == 1 {
                return ipString(configs[0]), nil
        }

        stringify := func(cfgs []*cni.IPConfig) []string {
                var out []string
                for _, cfg := range cfgs {
                        out = append(out, ipString(cfg))
                }
                return out
        }

        // preferred reports whether an address belongs to the wanted family;
        // it stays nil when the CNI order should be used directly.
        var preferred func(*cni.IPConfig) bool
        switch preference {
        case "cni":
                // use func default return
        case "ipv6":
                preferred = func(cfg *cni.IPConfig) bool { return cfg.IP.To4() == nil }
        default:
                if preference != "ipv4" && preference != "" {
                        log.G(ctx).WithField("ip_pref", preference).Warn("invalid ip_pref, falling back to ipv4")
                }
                preferred = func(cfg *cni.IPConfig) bool { return cfg.IP.To4() != nil }
        }

        if preferred != nil {
                var skipped []string
                for idx, cfg := range configs {
                        if preferred(cfg) {
                                return ipString(cfg), append(skipped, stringify(configs[idx+1:])...)
                        }
                        skipped = append(skipped, ipString(cfg))
                }
        }

        all := stringify(configs)
        return all[0], all[1:]
}

// ipString returns the textual form of the IP held by a CNI IP config.
func ipString(ip *cni.IPConfig) string {
        addr := ip.IP
        return addr.String()
}

// logDebugCNIResult logs the JSON encoding of a CNI result at debug level,
// tagged with the sandbox id. Nothing is marshalled when the log level is
// below debug; a marshalling failure is logged as an error instead.
func logDebugCNIResult(ctx context.Context, sandboxID string, result *cni.Result) {
        if log.GetLevel() >= log.DebugLevel {
                entry := log.G(ctx).WithField("podsandboxid", sandboxID)

                encoded, err := json.Marshal(result)
                if err != nil {
                        entry.WithError(err).Errorf("Failed to marshal CNI result: %v", err)
                        return
                }
                entry.Debugf("cni result: %s", string(encoded))
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "time"

        "github.com/containerd/platforms"

        "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/sandbox"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
)

// criSandboxService routes sandbox operations to the sandbox controller
// registered under a given sandboxer name.
type criSandboxService struct {
        // sandboxControllers maps a sandboxer name to its controller.
        sandboxControllers map[string]sandbox.Controller
        // config is the CRI configuration passed to newCriSandboxService.
        config             *criconfig.Config
}

// newCriSandboxService returns a criSandboxService that holds the given CRI
// config and the sandboxer-name-to-controller map.
func newCriSandboxService(config *criconfig.Config, sandboxers map[string]sandbox.Controller) *criSandboxService {
        svc := new(criSandboxService)
        svc.config = config
        svc.sandboxControllers = sandboxers
        return svc
}

// SandboxController returns the controller registered under the given
// sandboxer name, or an error when no such controller exists.
func (c *criSandboxService) SandboxController(sandboxer string) (sandbox.Controller, error) {
        if controller, found := c.sandboxControllers[sandboxer]; found {
                return controller, nil
        }
        return nil, fmt.Errorf("failed to get sandbox controller by %s", sandboxer)
}

// CreateSandbox looks up the controller named by info.Sandboxer and asks it to
// create the sandbox. The lookup error is returned unchanged when the
// controller is not registered.
func (c *criSandboxService) CreateSandbox(ctx context.Context, info sandbox.Sandbox, opts ...sandbox.CreateOpt) error {
        controller, lookupErr := c.SandboxController(info.Sandboxer)
        if lookupErr != nil {
                return lookupErr
        }
        return controller.Create(ctx, info, opts...)
}

// StartSandbox starts the sandbox with the given id through the named
// sandboxer's controller and returns the resulting controller instance. A
// zero ControllerInstance and the lookup error are returned when the
// controller is not registered.
func (c *criSandboxService) StartSandbox(ctx context.Context, sandboxer string, sandboxID string) (sandbox.ControllerInstance, error) {
        controller, lookupErr := c.SandboxController(sandboxer)
        if lookupErr != nil {
                var none sandbox.ControllerInstance
                return none, lookupErr
        }
        return controller.Start(ctx, sandboxID)
}

// WaitSandbox returns a channel that receives exactly one exit status for the
// sandbox and is then closed. The wait runs in a separate goroutine; if the
// controller's Wait fails, the delivered status carries
// client.UnknownExitStatus, a zero time and the error. An error is returned
// directly only when the controller is not registered.
func (c *criSandboxService) WaitSandbox(ctx context.Context, sandboxer string, sandboxID string) (<-chan client.ExitStatus, error) {
        controller, lookupErr := c.SandboxController(sandboxer)
        if lookupErr != nil {
                return nil, lookupErr
        }

        // Buffered by one so the goroutine can deliver its result and finish
        // even if the receiver is not reading yet.
        exitCh := make(chan client.ExitStatus, 1)
        go func() {
                defer close(exitCh)

                var result *client.ExitStatus
                if exit, waitErr := controller.Wait(ctx, sandboxID); waitErr != nil {
                        result = client.NewExitStatus(client.UnknownExitStatus, time.Time{}, waitErr)
                } else {
                        result = client.NewExitStatus(exit.ExitStatus, exit.ExitedAt, nil)
                }
                exitCh <- *result
        }()

        return exitCh, nil
}

// SandboxStatus queries the named sandboxer's controller for the status of the
// sandbox, passing the verbose flag through. A zero ControllerStatus and the
// lookup error are returned when the controller is not registered.
func (c *criSandboxService) SandboxStatus(ctx context.Context, sandboxer string, sandboxID string, verbose bool) (sandbox.ControllerStatus, error) {
        controller, lookupErr := c.SandboxController(sandboxer)
        if lookupErr != nil {
                var none sandbox.ControllerStatus
                return none, lookupErr
        }
        return controller.Status(ctx, sandboxID, verbose)
}

// SandboxPlatform asks the named sandboxer's controller for the platform of
// the sandbox. A zero Platform and the lookup error are returned when the
// controller is not registered.
func (c *criSandboxService) SandboxPlatform(ctx context.Context, sandboxer string, sandboxID string) (platforms.Platform, error) {
        controller, lookupErr := c.SandboxController(sandboxer)
        if lookupErr != nil {
                var none platforms.Platform
                return none, lookupErr
        }
        return controller.Platform(ctx, sandboxID)
}

// ShutdownSandbox resolves the controller registered under sandboxer and
// delegates the shutdown of sandboxID to it. A controller lookup failure is
// returned as-is.
func (c *criSandboxService) ShutdownSandbox(ctx context.Context, sandboxer string, sandboxID string) error {
        controller, lookupErr := c.SandboxController(sandboxer)
        if lookupErr != nil {
                return lookupErr
        }

        return controller.Shutdown(ctx, sandboxID)
}

// StopSandbox resolves the controller registered under sandboxer and delegates
// stopping sandboxID to it, forwarding opts unchanged. A controller lookup
// failure is returned as-is.
func (c *criSandboxService) StopSandbox(ctx context.Context, sandboxer, sandboxID string, opts ...sandbox.StopOpt) error {
        controller, lookupErr := c.SandboxController(sandboxer)
        if lookupErr != nil {
                return lookupErr
        }

        return controller.Stop(ctx, sandboxID, opts...)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// PodSandboxStats looks up the sandbox named by the request in the sandbox
// store and returns its stats as computed by podSandboxStats. Both the store
// lookup failure and the stats failure are wrapped with the requested sandbox
// ID.
func (c *criService) PodSandboxStats(
        ctx context.Context,
        r *runtime.PodSandboxStatsRequest,
) (*runtime.PodSandboxStatsResponse, error) {
        sandboxID := r.GetPodSandboxId()

        sb, err := c.sandboxStore.Get(sandboxID)
        if err != nil {
                return nil, fmt.Errorf("an error occurred when trying to find sandbox %s: %w", sandboxID, err)
        }

        stats, err := c.podSandboxStats(ctx, sb)
        if err != nil {
                return nil, fmt.Errorf("failed to decode pod sandbox metrics %s: %w", sandboxID, err)
        }

        resp := &runtime.PodSandboxStatsResponse{Stats: stats}
        return resp, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "time"

        "github.com/containerd/cgroups/v3"
        "github.com/containerd/cgroups/v3/cgroup1"
        cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containernetworking/plugins/pkg/ns"
        "github.com/vishvananda/netlink"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// podSandboxStats assembles the CRI stats for a single sandbox.
//
// The sandbox must be in the ready state; otherwise an error wrapping
// errdefs.ErrUnavailable is returned. Cgroup metrics are loaded through
// metricsForSandbox and converted into CPU and memory usage. Network usage for
// the default interface is added when the sandbox has a network namespace
// path. The process count is the sum of the pids of every running container
// that belongs to the sandbox, and the per-container stats come from
// ListContainerStats filtered by the sandbox ID.
//
// Every error is wrapped with %w, so callers can still classify failures with
// the errdefs helpers.
func (c *criService) podSandboxStats(
        ctx context.Context,
        sandbox sandboxstore.Sandbox) (*runtime.PodSandboxStats, error) {
        meta := sandbox.Metadata

        if sandbox.Status.Get().State != sandboxstore.StateReady {
                return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state: %w", meta.ID, errdefs.ErrUnavailable)
        }

        stats, err := metricsForSandbox(sandbox)
        if err != nil {
                return nil, fmt.Errorf("failed getting metrics for sandbox %s: %w", sandbox.ID, err)
        }

        podSandboxStats := &runtime.PodSandboxStats{
                Linux: &runtime.LinuxPodSandboxStats{},
                Attributes: &runtime.PodSandboxAttributes{
                        Id:          meta.ID,
                        Metadata:    meta.Config.GetMetadata(),
                        Labels:      meta.Config.GetLabels(),
                        Annotations: meta.Config.GetAnnotations(),
                },
        }

        if stats != nil {
                // A single timestamp is shared by the CPU, memory and process
                // samples of this call.
                timestamp := time.Now()

                cpuStats, err := c.cpuContainerStats(meta.ID, true /* isSandbox */, stats, timestamp)
                if err != nil {
                        return nil, fmt.Errorf("failed to obtain cpu stats: %w", err)
                }
                podSandboxStats.Linux.Cpu = cpuStats

                memoryStats, err := c.memoryContainerStats(meta.ID, stats, timestamp)
                if err != nil {
                        return nil, fmt.Errorf("failed to obtain memory stats: %w", err)
                }
                podSandboxStats.Linux.Memory = memoryStats

                if sandbox.NetNSPath != "" {
                        rxBytes, rxErrors, txBytes, txErrors := getContainerNetIO(ctx, sandbox.NetNSPath)
                        podSandboxStats.Linux.Network = &runtime.NetworkUsage{
                                DefaultInterface: &runtime.NetworkInterfaceUsage{
                                        Name:     defaultIfName,
                                        RxBytes:  &runtime.UInt64Value{Value: rxBytes},
                                        RxErrors: &runtime.UInt64Value{Value: rxErrors},
                                        TxBytes:  &runtime.UInt64Value{Value: txBytes},
                                        TxErrors: &runtime.UInt64Value{Value: txErrors},
                                },
                        }
                }

                // Count the processes of every running container in this sandbox.
                var pidCount uint64
                for _, cntr := range c.containerStore.List() {
                        if cntr.SandboxID != sandbox.ID {
                                continue
                        }

                        state := cntr.Status.Get().State()
                        if state != runtime.ContainerState_CONTAINER_RUNNING {
                                continue
                        }

                        task, err := cntr.Container.Task(ctx, nil)
                        if err != nil {
                                // Wrapped with %w so the underlying error stays
                                // matchable while the message names the container.
                                return nil, fmt.Errorf("failed to get task for container %q: %w", cntr.ID, err)
                        }

                        processes, err := task.Pids(ctx)
                        if err != nil {
                                return nil, fmt.Errorf("failed to list pids for container %q: %w", cntr.ID, err)
                        }
                        pidCount += uint64(len(processes))
                }
                podSandboxStats.Linux.Process = &runtime.ProcessUsage{
                        Timestamp:    timestamp.UnixNano(),
                        ProcessCount: &runtime.UInt64Value{Value: pidCount},
                }

                listContainerStatsRequest := &runtime.ListContainerStatsRequest{Filter: &runtime.ContainerStatsFilter{PodSandboxId: meta.ID}}
                resp, err := c.ListContainerStats(ctx, listContainerStatsRequest)
                if err != nil {
                        return nil, fmt.Errorf("failed to obtain container stats during podSandboxStats call: %w", err)
                }
                podSandboxStats.Linux.Containers = resp.GetStats()
        }

        return podSandboxStats, nil
}

// getContainerNetIO reads the rx/tx byte and error counters of the default
// interface (defaultIfName) from inside the network namespace at netNsPath.
//
// Collection is best-effort: on any failure the error is logged and the
// counters gathered so far (zero by default) are returned. The failure may
// come from entering the namespace or from looking up the link.
//
// Adapted from:
// https://github.com/cri-o/cri-o/blob/74a5cf8dffd305b311eb1c7f43a4781738c388c1/internal/oci/stats.go#L32
func getContainerNetIO(ctx context.Context, netNsPath string) (rxBytes, rxErrors, txBytes, txErrors uint64) {
        err := ns.WithNetNSPath(netNsPath, func(_ ns.NetNS) error {
                link, err := netlink.LinkByName(defaultIfName)
                if err != nil {
                        return err
                }
                attrs := link.Attrs()
                if attrs != nil && attrs.Statistics != nil {
                        rxBytes = attrs.Statistics.RxBytes
                        rxErrors = attrs.Statistics.RxErrors
                        txBytes = attrs.Statistics.TxBytes
                        txErrors = attrs.Statistics.TxErrors
                }
                return nil
        })
        if err != nil {
                // Log once here so that a failure to enter the namespace is
                // reported as well, not only a link lookup failure.
                log.G(ctx).WithError(err).Errorf("unable to retrieve network namespace stats for netNsPath: %v, interface: %v", netNsPath, defaultIfName)
        }

        return rxBytes, rxErrors, txBytes, txErrors
}

// metricsForSandbox loads the cgroup named by the sandbox config's cgroup
// parent and returns its statistics.
//
// The concrete type of the result depends on the host cgroup mode: the cgroup2
// package's stat result when the mode is unified, the cgroup1 package's stat
// result otherwise. An empty cgroup parent is an error.
func metricsForSandbox(sandbox sandboxstore.Sandbox) (interface{}, error) {
        cgroupPath := sandbox.Config.GetLinux().GetCgroupParent()
        if cgroupPath == "" {
                return nil, fmt.Errorf("failed to get cgroup metrics for sandbox %v because cgroupPath is empty", sandbox.ID)
        }

        // Non-unified hierarchy: go through the cgroup v1 API.
        if cgroups.Mode() != cgroups.Unified {
                control, err := cgroup1.Load(cgroup1.StaticPath(cgroupPath))
                if err != nil {
                        return nil, fmt.Errorf("failed to load sandbox cgroup %v: %w", cgroupPath, err)
                }
                v1Stats, err := control.Stat(cgroup1.IgnoreNotExist)
                if err != nil {
                        return nil, fmt.Errorf("failed to get stats for cgroup %v: %w", cgroupPath, err)
                }
                return v1Stats, nil
        }

        // Unified hierarchy: go through the cgroup v2 API.
        manager, err := cgroupsv2.Load(cgroupPath)
        if err != nil {
                return nil, fmt.Errorf("failed to load sandbox cgroup: %v: %w", cgroupPath, err)
        }
        v2Stats, err := manager.Stat()
        if err != nil {
                return nil, fmt.Errorf("failed to get stats for cgroup: %v: %w", cgroupPath, err)
        }
        return v2Stats, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "errors"
        "fmt"

        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// ListPodSandboxStats collects stats for every sandbox selected by the request.
//
// Sandboxes whose stats fail with an unavailable or not-found error are logged
// at debug level and left out of the response. Any other failure is recorded
// and the remaining sandboxes are still processed. The response therefore
// holds every sandbox that succeeded, and the returned error joins all
// recorded failures (nil when there were none).
func (c *criService) ListPodSandboxStats(
        ctx context.Context,
        r *runtime.ListPodSandboxStatsRequest,
) (*runtime.ListPodSandboxStatsResponse, error) {
        var (
                failures []error
                resp     = new(runtime.ListPodSandboxStatsResponse)
        )

        for _, sb := range c.sandboxesForListPodSandboxStatsRequest(r) {
                stats, err := c.podSandboxStats(ctx, sb)
                if err == nil {
                        resp.Stats = append(resp.Stats, stats)
                        continue
                }

                if errdefs.IsUnavailable(err) || errdefs.IsNotFound(err) {
                        log.G(ctx).WithField("podsandboxid", sb.ID).Debugf("failed to get pod sandbox stats, this is likely a transient error: %v", err)
                        continue
                }

                failures = append(failures, fmt.Errorf("failed to decode sandbox container metrics for sandbox %q: %w", sb.ID, err))
        }

        return resp, errors.Join(failures...)
}

// sandboxesForListPodSandboxStatsRequest returns the sandboxes from the store
// that the request selects.
//
// Without a filter, every stored sandbox is returned unchanged. With a filter,
// the filter is first normalized. A sandbox is then kept only when it matches
// the filter's ID (if set), matches the label selector (if set), and is in the
// ready state.
func (c *criService) sandboxesForListPodSandboxStatsRequest(r *runtime.ListPodSandboxStatsRequest) []sandboxstore.Sandbox {
        stored := c.sandboxStore.List()

        filter := r.GetFilter()
        if filter == nil {
                return stored
        }
        c.normalizePodSandboxStatsFilter(filter)

        var selected []sandboxstore.Sandbox
        for _, sb := range stored {
                if wantID := filter.GetId(); wantID != "" && sb.ID != wantID {
                        continue
                }

                if selector := filter.GetLabelSelector(); selector != nil &&
                        !matchLabelSelector(selector, sb.Config.GetLabels()) {
                        continue
                }

                // Metrics cannot be obtained for sandboxes that are not ready.
                if sb.Status.Get().State != sandboxstore.StateReady {
                        continue
                }

                selected = append(selected, sb)
        }

        return selected
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "encoding/json"
        "fmt"
        "time"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        "github.com/containerd/containerd/v2/internal/cri/types"
        "github.com/containerd/errdefs"
)

// PodSandboxStatus reports the CRI status of the sandbox named in the request.
//
// The sandbox is read from the local store, its IPs are resolved with getIPs,
// and the sandbox service is asked for the controller-side status. A
// not-found answer from the controller is not treated as a failure: the
// sandbox is reported as NOTREADY and, for verbose requests, the info map is
// rebuilt from the cached sandbox. When the resulting creation time is zero it
// is replaced by the creation time recorded in the client's sandbox store.
func (c *criService) PodSandboxStatus(ctx context.Context, r *runtime.PodSandboxStatusRequest) (*runtime.PodSandboxStatusResponse, error) {
        sandbox, err := c.sandboxStore.Get(r.GetPodSandboxId())
        if err != nil {
                return nil, fmt.Errorf("an error occurred when try to find sandbox: %w", err)
        }

        ip, additionalIPs, err := c.getIPs(sandbox)
        if err != nil {
                return nil, fmt.Errorf("failed to get sandbox ip: %w", err)
        }

        var (
                createdAt time.Time
                state     string
                info      map[string]string
        )
        cstatus, err := c.sandboxService.SandboxStatus(ctx, sandbox.Sandboxer, sandbox.ID, r.GetVerbose())
        switch {
        case err == nil:
                state, createdAt, info = cstatus.State, cstatus.CreatedAt, cstatus.Info
        case !errdefs.IsNotFound(err):
                return nil, fmt.Errorf("failed to query controller status: %w", err)
        default:
                // The controller no longer knows the sandbox although it is still
                // in our store. With the shim-mode controller this is a decent
                // sign that the shim died unexpectedly (segfault etc.): after the
                // usual StopPodSandbox -> RemovePodSandbox sequence the store
                // lookup above would have failed instead. Report NOTREADY rather
                // than an error to keep k8s and clients such as crictl happy.
                state = runtime.PodSandboxState_SANDBOX_NOTREADY.String()
                if r.GetVerbose() {
                        if info, err = toDeletedCRISandboxInfo(sandbox); err != nil {
                                return nil, err
                        }
                }
        }

        status := toCRISandboxStatus(sandbox.Metadata, state, createdAt, ip, additionalIPs)
        if status.GetCreatedAt() == 0 {
                // CRI doesn't allow CreatedAt == 0, so use the metadata store's value.
                stored, err := c.client.SandboxStore().Get(ctx, sandbox.ID)
                if err != nil {
                        return nil, fmt.Errorf("failed to get sandbox %q from metadata store: %w", sandbox.ID, err)
                }
                status.CreatedAt = stored.CreatedAt.UnixNano()
        }

        resp := &runtime.PodSandboxStatusResponse{
                Status: status,
                Info:   info,
        }
        return resp, nil
}

// getIPs returns the primary IP and the additional IPs recorded on the sandbox.
//
// No IPs are reported for sandboxes that use the host network or whose network
// namespace is closed. A failure to determine whether the namespace is closed
// is returned as an error.
func (c *criService) getIPs(sandbox sandboxstore.Sandbox) (string, []string, error) {
        // Sandboxes on the node network: reporting the IP is not our job.
        if hostNetwork(sandbox.Config) {
                return "", nil, nil
        }

        closed, err := sandbox.NetNS.Closed()
        switch {
        case err != nil:
                return "", nil, fmt.Errorf("check network namespace closed: %w", err)
        case closed:
                return "", nil, nil
        }

        return sandbox.IP, sandbox.AdditionalIPs, nil
}

// toCRISandboxStatus builds a CRI PodSandboxStatus from sandbox metadata.
//
// status is looked up in runtime.PodSandboxState_value; names not present
// there map to SANDBOX_NOTREADY. createdAt is emitted as Unix nanoseconds, ip
// becomes the primary network IP, and each entry of additionalIPs becomes a
// PodIP.
func toCRISandboxStatus(meta sandboxstore.Metadata, status string, createdAt time.Time, ip string, additionalIPs []string) *runtime.PodSandboxStatus {
        // NOTREADY unless the status string names a known state.
        sandboxState := runtime.PodSandboxState_SANDBOX_NOTREADY
        if code, known := runtime.PodSandboxState_value[status]; known {
                sandboxState = runtime.PodSandboxState(code)
        }

        var podIPs []*runtime.PodIP
        for i := range additionalIPs {
                podIPs = append(podIPs, &runtime.PodIP{Ip: additionalIPs[i]})
        }

        cfg := meta.Config
        network := &runtime.PodSandboxNetworkStatus{
                Ip:            ip,
                AdditionalIps: podIPs,
        }
        linux := &runtime.LinuxPodSandboxStatus{
                Namespaces: &runtime.Namespace{
                        Options: cfg.GetLinux().GetSecurityContext().GetNamespaceOptions(),
                },
        }

        return &runtime.PodSandboxStatus{
                Id:             meta.ID,
                Metadata:       cfg.GetMetadata(),
                State:          sandboxState,
                CreatedAt:      createdAt.UnixNano(),
                Network:        network,
                Linux:          linux,
                Labels:         cfg.GetLabels(),
                Annotations:    cfg.GetAnnotations(),
                RuntimeHandler: meta.RuntimeHandler,
        }
}

// toDeletedCRISandboxInfo builds the verbose info map for a sandbox from the
// cached sandbox alone.
//
// It is the fallback for the case where controller.Status() with verbose=true
// returns a NotFound error and therefore cannot supply SandboxInfo itself. The
// status is always reported as "deleted", a value that does not exist in
// containerd. The result has a single "info" key holding the JSON-encoded
// SandboxInfo.
func toDeletedCRISandboxInfo(sandbox sandboxstore.Sandbox) (map[string]string, error) {
        si := &types.SandboxInfo{
                Pid:            sandbox.Status.Get().Pid,
                Status:         "deleted",
                Config:         sandbox.Config,
                RuntimeHandler: sandbox.RuntimeHandler,
                CNIResult:      sandbox.CNIResult,
                Metadata:       &sandbox.Metadata,
        }

        // Record whether the network namespace is closed, when there is one.
        if netNS := sandbox.NetNS; netNS != nil {
                closed, err := netNS.Closed()
                if err != nil {
                        return nil, fmt.Errorf("failed to check network namespace closed: %w", err)
                }
                si.NetNSClosed = closed
        }

        encoded, err := json.Marshal(si)
        if err != nil {
                return nil, fmt.Errorf("failed to marshal info %v: %w", si, err)
        }

        info := map[string]string{"info": string(encoded)}
        return info, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "errors"
        "fmt"
        "time"

        "github.com/containerd/log"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        "github.com/containerd/errdefs"
)

// StopPodSandbox stops the sandbox named in the request. Containers that belong
// to the sandbox are forcibly stopped as part of the operation (see
// stopPodSandbox). An empty response is returned on success.
func (c *criService) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandboxRequest) (*runtime.StopPodSandboxResponse, error) {
        sandboxID := r.GetPodSandboxId()

        sb, err := c.sandboxStore.Get(sandboxID)
        if err != nil {
                return nil, fmt.Errorf("an error occurred when try to find sandbox %q: %w",
                        sandboxID, err)
        }

        err = c.stopPodSandbox(ctx, sb)
        if err != nil {
                return nil, err
        }

        return new(runtime.StopPodSandboxResponse), nil
}

// stopPodSandbox tears a sandbox down in this order:
//
//  1. forcibly stop every container in the store that belongs to the sandbox;
//  2. stop the sandbox through the sandbox service when its state is ready or
//     unknown (a not-found answer is logged and ignored);
//  3. record the stop duration and notify NRI (a notification failure is only
//     logged);
//  4. when the sandbox has a network namespace, tear down the pod network and
//     remove the namespace.
//
// The first failure of steps 1, 2 or 4 aborts the sequence and is returned.
func (c *criService) stopPodSandbox(ctx context.Context, sandbox sandboxstore.Sandbox) error {
        // Always work with the full sandbox id.
        sandboxID := sandbox.ID

        // Containers are terminated forcibly. New containers may still be created
        // in the meantime, so production should not rely on this behavior.
        // TODO(random-liu): Introduce a state in sandbox to avoid future container creation.
        stopStarted := time.Now()
        for _, cntr := range c.containerStore.List() {
                if cntr.SandboxID != sandboxID {
                        continue
                }
                // `StopContainer` is deliberately avoided: it would introduce a race
                // if a container is removed after the list was taken.
                if err := c.stopContainer(ctx, cntr, 0); err != nil {
                        return fmt.Errorf("failed to stop container %q: %w", cntr.ID, err)
                }
        }

        // The sandbox container is only stopped when it is running or unknown.
        switch sandbox.Status.Get().State {
        case sandboxstore.StateReady, sandboxstore.StateUnknown:
                err := c.sandboxService.StopSandbox(ctx, sandbox.Sandboxer, sandboxID)
                if err != nil && !errdefs.IsNotFound(err) {
                        return fmt.Errorf("failed to stop sandbox %q: %w", sandboxID, err)
                }
                if err != nil {
                        // The controller already removed the sandbox: log and move on.
                        log.G(ctx).Warnf("sandbox %q is not found when stopping it", sandboxID)
                }
        }

        sandboxRuntimeStopTimer.WithValues(sandbox.RuntimeHandler).UpdateSince(stopStarted)

        if err := c.nri.StopPodSandbox(ctx, &sandbox); err != nil {
                log.G(ctx).WithError(err).Errorf("NRI sandbox stop notification failed")
        }

        // Teardown network for sandbox.
        if sandbox.NetNS != nil {
                netStopStarted := time.Now()

                // An empty netns path is used when the netns is not available, as
                // defined in:
                // https://github.com/containernetworking/cni/blob/v0.7.0-alpha1/SPEC.md
                closed, err := sandbox.NetNS.Closed()
                if err != nil {
                        return fmt.Errorf("failed to check network namespace closed: %w", err)
                }
                if closed {
                        sandbox.NetNSPath = ""
                }

                if err := c.teardownPodNetwork(ctx, sandbox); err != nil {
                        return fmt.Errorf("failed to destroy network for sandbox %q: %w", sandboxID, err)
                }
                if err := sandbox.NetNS.Remove(); err != nil {
                        return fmt.Errorf("failed to remove network namespace for sandbox %q: %w", sandboxID, err)
                }
                sandboxDeleteNetwork.UpdateSince(netStopStarted)
        }

        log.G(ctx).Infof("TearDown network for sandbox %q successfully", sandboxID)

        return nil
}

// waitSandboxStop blocks until the sandbox's Stopped channel fires or ctx is
// done, whichever happens first. It returns nil in the former case and the
// wrapped context error in the latter.
func (c *criService) waitSandboxStop(ctx context.Context, sandbox sandboxstore.Sandbox) error {
        select {
        case <-sandbox.Stopped():
                return nil
        case <-ctx.Done():
                return fmt.Errorf("wait sandbox container %q: %w", sandbox.ID, ctx.Err())
        }
}

// teardownPodNetwork removes the pod network of the sandbox through the network
// plugin selected by the sandbox's runtime handler.
//
// It fails when no plugin is available for that handler or when the CNI
// namespace options cannot be built. The teardown operation counter and the
// latency metric are updated on every attempt; the error counter only when the
// plugin reports a failure, which is then returned unchanged.
func (c *criService) teardownPodNetwork(ctx context.Context, sandbox sandboxstore.Sandbox) error {
        plugin := c.getNetworkPlugin(sandbox.RuntimeHandler)
        if plugin == nil {
                return errors.New("cni config not initialized")
        }

        nsOpts, err := cniNamespaceOpts(sandbox.ID, sandbox.Config)
        if err != nil {
                return fmt.Errorf("get cni namespace options: %w", err)
        }

        started := time.Now()
        removeErr := plugin.Remove(ctx, sandbox.ID, sandbox.NetNSPath, nsOpts...)
        networkPluginOperations.WithValues(networkTearDownOp).Inc()
        networkPluginOperationsLatency.WithValues(networkTearDownOp).UpdateSince(started)
        if removeErr != nil {
                networkPluginOperationsErrors.WithValues(networkTearDownOp).Inc()
        }
        return removeErr
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "io"
        "net/http"
        "slices"
        "sync"
        "sync/atomic"
        "time"

        "github.com/containerd/go-cni"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
        "github.com/containerd/typeurl/v2"
        "github.com/opencontainers/runtime-spec/specs-go/features"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
        "k8s.io/kubelet/pkg/cri/streaming"

        apitypes "github.com/containerd/containerd/api/types"
        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/introspection"
        _ "github.com/containerd/containerd/v2/core/runtime" // for typeurl init
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/internal/cri/config"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        "github.com/containerd/containerd/v2/internal/cri/nri"
        "github.com/containerd/containerd/v2/internal/cri/server/events"
        containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
        imagestore "github.com/containerd/containerd/v2/internal/cri/store/image"
        "github.com/containerd/containerd/v2/internal/cri/store/label"
        sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
        snapshotstore "github.com/containerd/containerd/v2/internal/cri/store/snapshot"
        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/containerd/v2/internal/eventq"
        "github.com/containerd/containerd/v2/internal/registrar"
        "github.com/containerd/containerd/v2/pkg/oci"
        osinterface "github.com/containerd/containerd/v2/pkg/os"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/containerd/v2/plugins"
)

// kernelSupportsRRO is a package-level flag that is false until something
// outside this declaration assigns it.
// NOTE(review): the name suggests it records whether the running kernel
// supports recursive read-only (RRO) mounts — confirm where it is set.
var kernelSupportsRRO bool

// defaultNetworkPlugin is the name used for the default CNI configuration.
const defaultNetworkPlugin = "default"

// CRIService is the interface implemented by the CRI remote service server.
type CRIService interface {
        // Closer is used by containerd to gracefully stop the cri service.
        io.Closer

        // IsInitialized reports whether the service has been initialized.
        // NOTE(review): exact semantics live in the implementation — confirm there.
        IsInitialized() bool

        // Run runs the service. ready is a callback supplied by the caller;
        // presumably it is invoked once the service is ready — verify against
        // the implementation.
        Run(ready func()) error
}

// sandboxService is the set of sandbox lifecycle operations the CRI service
// depends on. Apart from CreateSandbox, which takes the full sandbox.Sandbox
// description, every operation takes the name of a sandboxer and the ID of the
// sandbox to act on. SandboxController exposes the sandbox.Controller behind a
// given sandboxer name.
type sandboxService interface {
        CreateSandbox(ctx context.Context, info sandbox.Sandbox, opts ...sandbox.CreateOpt) error
        StartSandbox(ctx context.Context, sandboxer string, sandboxID string) (sandbox.ControllerInstance, error)
        // WaitSandbox returns a channel on which the sandbox's exit status is delivered.
        WaitSandbox(ctx context.Context, sandboxer string, sandboxID string) (<-chan containerd.ExitStatus, error)
        StopSandbox(ctx context.Context, sandboxer, sandboxID string, opts ...sandbox.StopOpt) error
        ShutdownSandbox(ctx context.Context, sandboxer string, sandboxID string) error
        SandboxStatus(ctx context.Context, sandboxer string, sandboxID string, verbose bool) (sandbox.ControllerStatus, error)
        SandboxPlatform(ctx context.Context, sandboxer string, sandboxID string) (platforms.Platform, error)
        SandboxController(sandboxer string) (sandbox.Controller, error)
}

// RuntimeService specifies the dependencies on the runtime service, which
// provides the runtime configuration and OCI spec loading.
type RuntimeService interface {
        // Config returns the CRI configuration.
        Config() criconfig.Config

        // LoadOCISpec loads cached OCI specs via `Runtime.BaseRuntimeSpec`
        LoadOCISpec(string) (*oci.Spec, error)
}

// ImageService specifies the dependencies on the image service.
type ImageService interface {
        // RuntimeSnapshotter returns a snapshotter name for the given runtime
        // configuration.
        // NOTE(review): selection rules are defined by the implementation — confirm there.
        RuntimeSnapshotter(ctx context.Context, ociRuntime criconfig.Runtime) string

        // PullImage pulls the image called name. credentials is a callback
        // returning two strings and an error for a given string argument
        // (presumably username/secret for a registry host — confirm); the
        // returned string presumably identifies the pulled image — confirm.
        PullImage(ctx context.Context, name string, credentials func(string) (string, string, error), sandboxConfig *runtime.PodSandboxConfig, runtimeHandler string) (string, error)
        // UpdateImage updates the image state for the reference r.
        // NOTE(review): what "update" covers is defined by the implementation.
        UpdateImage(ctx context.Context, r string) error

        CheckImages(ctx context.Context) error

        // GetImage returns the stored image with the given id.
        GetImage(id string) (imagestore.Image, error)
        // GetSnapshot returns the stored snapshot for the given key and snapshotter.
        GetSnapshot(key, snapshotter string) (snapshotstore.Snapshot, error)

        // LocalResolve resolves an image reference or ID to a stored image.
        LocalResolve(refOrID string) (imagestore.Image, error)

        // ImageFSPaths returns a map of image filesystem paths.
        // NOTE(review): presumably keyed by snapshotter name, as with
        // criService.imageFSPaths — confirm.
        ImageFSPaths() map[string]string
}

// criService implements CRIService.
type criService struct {
        // RuntimeService and ImageService are embedded, so their methods are
        // promoted onto criService.
        RuntimeService
        ImageService
        // config contains all configurations.
        config criconfig.Config
        // imageFSPaths contains the path to the image filesystem for each
        // snapshotter.
        imageFSPaths map[string]string
        // os is an interface for all required os operations.
        os osinterface.OS
        // sandboxStore stores all resources associated with sandboxes.
        sandboxStore *sandboxstore.Store
        // sandboxNameIndex stores all sandbox names and makes sure each name
        // is unique.
        sandboxNameIndex *registrar.Registrar
        // containerStore stores all resources associated with containers.
        containerStore *containerstore.Store
        // containerNameIndex stores all container names and makes sure each
        // name is unique.
        containerNameIndex *registrar.Registrar
        // netPlugin is used to set up and tear down the network when
        // running/stopping a pod sandbox. It is keyed by network plugin name.
        netPlugin map[string]cni.CNI
        // client is an instance of the containerd client
        client *containerd.Client
        // streamServer is the streaming server that serves container streaming
        // requests.
        streamServer streaming.Server
        // eventMonitor is the monitor that watches containerd events.
        eventMonitor *events.EventMonitor
        // initialized indicates whether the server is initialized. All GRPC services
        // should return error before the server is initialized.
        initialized atomic.Bool
        // cniNetConfMonitor is used to reload cni network conf if there is
        // any valid fs change events from cni network conf dir.
        cniNetConfMonitor map[string]*cniNetConfSyncer
        // allCaps is the list of the capabilities.
        // When nil, parsed from CapEff of /proc/self/status.
        allCaps []string //nolint:nolintlint,unused // Ignore on non-Linux
        // containerEventsQ is used to capture container events and send them
        // to the callers of GetContainerEvents.
        containerEventsQ eventq.EventQueue[runtime.ContainerEventResponse]
        // nri is used to hook NRI into CRI request processing.
        nri *nri.API
        // sandboxService is the sandbox related service for CRI
        sandboxService sandboxService
        // runtimeHandlers contains runtime handler info
        runtimeHandlers []*runtime.RuntimeHandler
}

// CRIServiceOptions carries the dependencies NewCRIService needs to build a
// CRI service.
type CRIServiceOptions struct {
        // RuntimeService provides the CRI configuration (read once via Config
        // by NewCRIService) and becomes the service's embedded RuntimeService.
        RuntimeService RuntimeService

        // ImageService becomes the service's embedded ImageService and supplies
        // the image filesystem paths.
        ImageService ImageService

        // StreamingConfig is the configuration handed to the streaming server.
        StreamingConfig streaming.Config

        // NRI is the NRI API the CRI service registers itself with in Run.
        NRI *nri.API

        // SandboxControllers is a map of all the loaded sandbox controllers
        SandboxControllers map[string]sandbox.Controller

        // Client is the base containerd client used for accessing services,
        //
        // TODO: Replace this gradually with directly configured instances
        Client *containerd.Client
}

// NewCRIService builds a CRI service from the given options.
//
// The two non-error results are the same *criService value, returned once as
// a CRIService and once as a runtime.RuntimeServiceServer. Construction fails
// if platform initialization, streaming server creation, creation of a CNI
// conf syncer, or runtime handler introspection fails.
func NewCRIService(options *CRIServiceOptions) (CRIService, runtime.RuntimeServiceServer, error) {
        ctx := context.Background()
        labelStore := label.NewStore()
        cfg := options.RuntimeService.Config()

        c := &criService{
                RuntimeService:     options.RuntimeService,
                ImageService:       options.ImageService,
                config:             cfg,
                client:             options.Client,
                imageFSPaths:       options.ImageService.ImageFSPaths(),
                os:                 osinterface.RealOS{},
                sandboxStore:       sandboxstore.NewStore(labelStore),
                containerStore:     containerstore.NewStore(labelStore),
                sandboxNameIndex:   registrar.NewRegistrar(),
                containerNameIndex: registrar.NewRegistrar(),
                netPlugin:          make(map[string]cni.CNI),
                sandboxService:     newCriSandboxService(&cfg, options.SandboxControllers),
        }

        // Events that are discarded from the queue are counted and logged.
        onDiscard := func(ev runtime.ContainerEventResponse) {
                containerEventsDroppedCount.Inc()
                fields := log.Fields{
                        "container": ev.ContainerId,
                        "type":      ev.ContainerEventType,
                }
                log.L.WithFields(fields).Warn("container event discarded")
        }
        // TODO: Make discard time configurable
        c.containerEventsQ = eventq.New[runtime.ContainerEventResponse](5*time.Minute, onDiscard)

        if err := c.initPlatform(); err != nil {
                return nil, nil, fmt.Errorf("initialize platform: %w", err)
        }

        // The streaming server calls back into the CRI service through the
        // stream runtime wrapper.
        streamServer, err := streaming.NewServer(options.StreamingConfig, newStreamRuntime(c))
        if err != nil {
                return nil, nil, fmt.Errorf("failed to create stream server: %w", err)
        }
        c.streamServer = streamServer

        c.eventMonitor = events.NewEventMonitor(&criEventHandler{c: c})

        // Create one conf syncer per network plugin that has a conf dir. A
        // non-default plugin uses the conf dir of the runtime with its name,
        // when such a runtime is configured.
        c.cniNetConfMonitor = make(map[string]*cniNetConfSyncer)
        for name, plugin := range c.netPlugin {
                confDir := c.config.NetworkPluginConfDir
                if name != defaultNetworkPlugin {
                        if runtimeConf, found := c.config.Runtimes[name]; found {
                                confDir = runtimeConf.NetworkPluginConfDir
                        }
                }
                if confDir == "" {
                        continue
                }
                syncer, err := newCNINetConfSyncer(confDir, plugin, c.cniLoadOptions())
                if err != nil {
                        return nil, nil, fmt.Errorf("failed to create cni conf monitor for %s: %w", name, err)
                }
                c.cniNetConfMonitor[name] = syncer
        }

        c.nri = options.NRI

        handlers, err := c.introspectRuntimeHandlers(ctx)
        if err != nil {
                return nil, nil, fmt.Errorf("failed to introspect runtime handlers: %w", err)
        }
        c.runtimeHandlers = handlers

        return c, c, nil
}

// Run starts the CRI service and blocks until one of its critical background
// components exits.
//
// In order, it subscribes the event monitor to containerd events, recovers
// state, starts the event monitor, launches one syncLoop goroutine per CNI
// network conf syncer, starts the streaming server and registers the CRI
// domain with NRI. Only after all of that is the service marked initialized
// and the ready callback invoked.
//
// Once the first component exits, Close is called and the event monitor and
// streaming server channels are drained. The returned error is the first
// non-nil one of: a recover, NRI registration or Close failure, the event
// monitor error, the streaming server error, and the CNI network conf monitor
// error.
func (c *criService) Run(ready func()) error {
        log.L.Info("Start subscribing containerd event")
        // note: filters are any match, if you want any match but not in namespace foo
        // then you have to manually filter namespace foo
        c.eventMonitor.Subscribe(c.client, []string{`topic=="/tasks/oom"`, `topic~="/images/"`})

        log.L.Infof("Start recovering state")
        if err := c.recover(ctrdutil.NamespacedContext()); err != nil {
                return fmt.Errorf("failed to recover state: %w", err)
        }

        // Start event handler.
        log.L.Info("Start event monitor")
        eventMonitorErrCh := c.eventMonitor.Start()

        // Start CNI network conf syncers
        // The channel has one slot per syncer, so every syncLoop goroutine can
        // deliver its result without waiting for a receiver.
        cniNetConfMonitorErrCh := make(chan error, len(c.cniNetConfMonitor))
        var netSyncGroup sync.WaitGroup
        for name, h := range c.cniNetConfMonitor {
                netSyncGroup.Add(1)
                log.L.Infof("Start cni network conf syncer for %s", name)
                go func(h *cniNetConfSyncer) {
                        cniNetConfMonitorErrCh <- h.syncLoop()
                        netSyncGroup.Done()
                }(h)
        }
        // For platforms that may not support CNI (darwin etc.) there's no
        // use in launching this as `Wait` will return immediately. Further
        // down we select on this channel along with some others to determine
        // if we should Close() the CRI service, so closing this preemptively
        // isn't good.
        if len(c.cniNetConfMonitor) > 0 {
                go func() {
                        netSyncGroup.Wait()
                        close(cniNetConfMonitorErrCh)
                }()
        }

        // Start streaming server.
        log.L.Info("Start streaming server")
        streamServerErrCh := make(chan error)
        go func() {
                // Closing the channel makes later receives return nil once the
                // server has stopped.
                defer close(streamServerErrCh)
                // http.ErrServerClosed is not treated as a failure.
                if err := c.streamServer.Start(true); err != nil && err != http.ErrServerClosed {
                        log.L.WithError(err).Error("Failed to start streaming server")
                        streamServerErrCh <- err
                }
        }()

        // register CRI domain with NRI
        // NOTE(review): this early return does not call Close, so the
        // goroutines started above appear to keep running - confirm callers
        // treat this error as fatal.
        if err := c.nri.Register(&criImplementation{c}); err != nil {
                return fmt.Errorf("failed to set up NRI for CRI service: %w", err)
        }

        // Set the server as initialized. GRPC services could start serving traffic.
        c.initialized.Store(true)
        ready()

        var eventMonitorErr, streamServerErr, cniNetConfMonitorErr error
        // Stop the whole CRI service if any of the critical service exits.
        // The select blocks until the first of the three channels yields a
        // value or is closed.
        select {
        case eventMonitorErr = <-eventMonitorErrCh:
        case streamServerErr = <-streamServerErrCh:
        case cniNetConfMonitorErr = <-cniNetConfMonitorErrCh:
        }
        if err := c.Close(); err != nil {
                return fmt.Errorf("failed to stop cri service: %w", err)
        }
        // If the error is set above, err from channel must be nil here, because
        // the channel is supposed to be closed. Or else, we wait and set it.
        if err := <-eventMonitorErrCh; err != nil {
                eventMonitorErr = err
        }
        log.L.Info("Event monitor stopped")
        if err := <-streamServerErrCh; err != nil {
                streamServerErr = err
        }
        log.L.Info("Stream server stopped")
        // Report at most one error: event monitor first, then stream server,
        // then CNI conf monitor.
        if eventMonitorErr != nil {
                return fmt.Errorf("event monitor error: %w", eventMonitorErr)
        }
        if streamServerErr != nil {
                return fmt.Errorf("stream server error: %w", streamServerErr)
        }
        if cniNetConfMonitorErr != nil {
                return fmt.Errorf("cni network conf monitor error: %w", cniNetConfMonitorErr)
        }
        return nil
}

// Close stops the CRI service: every CNI network conf monitor, then the event
// monitor, then the streaming server.
//
// A monitor that fails to stop is only logged. A streaming server that fails
// to stop is returned as an error.
// TODO(random-liu): Make close synchronous.
func (c *criService) Close() error {
        log.L.Info("Stop CRI service")
        for name, syncer := range c.cniNetConfMonitor {
                stopErr := syncer.stop()
                if stopErr == nil {
                        continue
                }
                log.L.WithError(stopErr).Errorf("failed to stop cni network conf monitor for %s", name)
        }
        c.eventMonitor.Stop()
        stopErr := c.streamServer.Stop()
        if stopErr != nil {
                return fmt.Errorf("failed to stop stream server: %w", stopErr)
        }
        return nil
}

// IsInitialized reports whether the CRI service has finished initialization,
// i.e. whether the initialized flag has been set.
func (c *criService) IsInitialized() bool {
        initialized := c.initialized.Load()
        return initialized
}

// introspectRuntimeHandlers builds the runtime handler list from the
// configured runtimes.
//
// Each runtime yields one handler carrying its name. When feature
// introspection succeeds the handler also gets a Features value; when it
// fails the failure is logged at debug level and Features stays unset. The
// default runtime is listed a second time with an empty name. The returned
// error is always nil.
func (c *criService) introspectRuntimeHandlers(ctx context.Context) ([]*runtime.RuntimeHandler, error) {
        var handlers []*runtime.RuntimeHandler
        introService := c.client.IntrospectionService()
        for name, runtimeConf := range c.config.Runtimes {
                handler := &runtime.RuntimeHandler{Name: name}

                if rawFeatures, err := introspectRuntimeFeatures(ctx, introService, runtimeConf); err == nil {
                        handler.Features = &runtime.RuntimeHandlerFeatures{}

                        // Recursive read-only mounts need both the runtime ("rro"
                        // mount option) and the kernel to support them.
                        runtimeHasRRO := slices.Contains(rawFeatures.MountOptions, "rro")
                        switch {
                        case runtimeHasRRO && kernelSupportsRRO:
                                log.G(ctx).Debugf("runtime %q supports recursive read-only mounts", name)
                                handler.Features.RecursiveReadOnlyMounts = true
                        case runtimeHasRRO:
                                log.G(ctx).Debugf("runtime %q supports recursive read-only mounts, but the kernel does not", name)
                        }

                        usernsSupported := supportsCRIUserns(rawFeatures)
                        handler.Features.UserNamespaces = usernsSupported
                        log.G(ctx).Debugf("runtime %q supports CRI userns: %v", name, usernsSupported)
                } else {
                        log.G(ctx).WithError(err).Debugf("failed to introspect features of runtime %q", name)
                }

                handlers = append(handlers, handler)
                if name == c.config.DefaultRuntimeName {
                        defaultHandler := *handler
                        defaultHandler.Name = "" // denotes default
                        handlers = append(handlers, &defaultHandler)
                }
        }
        return handlers, nil
}

// introspectRuntimeFeatures asks the introspection service for the OCI
// runtime features of runtime r.
//
// Only runtimes of type plugins.RuntimeRuncV2 are supported; any other type
// is rejected with an error. The request names r.Path as the runtime path
// when it is set and r.Type otherwise, and carries the generated runtime
// options when there are any.
func introspectRuntimeFeatures(ctx context.Context, intro introspection.Service, r config.Runtime) (*features.Features, error) {
        // For other runtimes, protobuf.MarshalAnyToProto will cause nil panic during typeurl dereference
        if r.Type != plugins.RuntimeRuncV2 {
                return nil, fmt.Errorf("introspecting OCI runtime features needs the runtime type to be %q, got %q",
                        plugins.RuntimeRuncV2, r.Type)
        }

        runtimePath := r.Type // "io.containerd.runc.v2"
        if r.Path != "" {
                runtimePath = r.Path // "/usr/local/bin/crun"
        }
        req := &apitypes.RuntimeRequest{RuntimePath: runtimePath}

        options, err := config.GenerateRuntimeOptions(r)
        if err != nil {
                return nil, err
        }
        if options != nil {
                marshaled, err := protobuf.MarshalAnyToProto(options)
                if err != nil {
                        return nil, fmt.Errorf("failed to marshal %T: %w", options, err)
                }
                req.Options = marshaled
        }

        pluginInfo, err := intro.PluginInfo(ctx, string(plugins.RuntimePluginV2), "task", req)
        if err != nil {
                return nil, fmt.Errorf("failed to call PluginInfo: %w", err)
        }

        // The runtime info travels in the Extra field of the plugin info, and
        // the features are in turn wrapped inside the runtime info.
        var runtimeInfo apitypes.RuntimeInfo
        if err := typeurl.UnmarshalTo(pluginInfo.Extra, &runtimeInfo); err != nil {
                return nil, fmt.Errorf("failed to get runtime info from plugin info: %w", err)
        }
        decoded, err := typeurl.UnmarshalAny(runtimeInfo.Features)
        if err != nil {
                return nil, fmt.Errorf("failed to unmarshal Features (%T): %w", runtimeInfo.Features, err)
        }
        if feats, ok := decoded.(*features.Features); ok {
                return feats, nil
        }
        return nil, fmt.Errorf("unknown features type %T", decoded)
}

// supportsCRIUserns reports whether the runtime features f are sufficient for
// user namespace support in CRI.
//
// That requires both the "user" namespace to be listed and ID-mapped mounts
// to be enabled. It returns false when f, or its Linux section, is nil.
func supportsCRIUserns(f *features.Features) bool {
        // Linux is a pointer and is left nil when the runtime reports no
        // Linux-specific features; dereferencing it unguarded would panic.
        if f == nil || f.Linux == nil {
                return false
        }
        userns := slices.Contains(f.Linux.Namespaces, "user")

        var idmap bool
        if m := f.Linux.MountExtensions; m != nil && m.IDMap != nil && m.IDMap.Enabled != nil {
                idmap = *m.IDMap.Enabled
        }

        // user namespace support in CRI requires userns and idmap support.
        return userns && idmap
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "fmt"

        "github.com/opencontainers/selinux/go-selinux"
        "tags.cncf.io/container-device-interface/pkg/cdi"

        "github.com/containerd/containerd/v2/pkg/cap"
        "github.com/containerd/containerd/v2/pkg/kernelversion"
        "github.com/containerd/containerd/v2/pkg/userns"
        "github.com/containerd/go-cni"
        "github.com/containerd/log"
)

// init records whether the running kernel is at least version 5.12 in
// kernelSupportsRRO. It panics if the kernel version cannot be checked.
func init() {
        minVersion := kernelversion.KernelVersion{Kernel: 5, Major: 12}
        supported, err := kernelversion.GreaterEqualThan(minVersion)
        kernelSupportsRRO = supported
        if err != nil {
                panic(fmt.Errorf("failed to check kernel version: %w", err))
        }
}

// networkAttachCount is the minimum number of networks the PodSandbox
// attaches to: the loopback network plus one non-host network.
const networkAttachCount = 2

// initPlatform handles linux specific initialization for the CRI service.
//
// It warns about configurations that typically do not work inside a user
// namespace, applies the SELinux settings, creates one CNI instance per
// network plugin conf dir (the default one plus every runtime that sets its
// own), fills allCaps from the current capabilities when it is nil, and
// configures CDI when it is enabled.
//
// It returns an error if a CNI instance cannot be created, the capabilities
// cannot be read, or CDI cannot be configured.
func (c *criService) initPlatform() (err error) {
        if userns.RunningInUserNS() {
                if c.apparmorEnabled() || !c.config.RestrictOOMScoreAdj {
                        log.L.Warn("Running CRI plugin in a user namespace typically requires disable_apparmor and restrict_oom_score_adj to be true")
                }
        }

        if c.config.EnableSelinux {
                if !selinux.GetEnabled() {
                        log.L.Warn("Selinux is not supported")
                }
                if r := c.config.SelinuxCategoryRange; r > 0 {
                        selinux.CategoryRange = uint32(r)
                }
        } else {
                selinux.SetDisabled()
        }

        // The default plugin is always present; runtimes only get their own
        // entry when they configure a dedicated conf dir.
        pluginDirs := map[string]string{
                defaultNetworkPlugin: c.config.NetworkPluginConfDir,
        }
        for name, conf := range c.config.Runtimes {
                if conf.NetworkPluginConfDir != "" {
                        pluginDirs[name] = conf.NetworkPluginConfDir
                }
        }

        c.netPlugin = make(map[string]cni.CNI)
        for name, dir := range pluginDirs {
                // A non-zero per-runtime limit overrides the global one.
                maxConfNum := c.config.NetworkPluginMaxConfNum
                if name != defaultNetworkPlugin {
                        if m := c.config.Runtimes[name].NetworkPluginMaxConfNum; m != 0 {
                                maxConfNum = m
                        }
                }
                // Pod needs to attach to at least loopback network and a non host network,
                // hence networkAttachCount is 2. If there are more network configs the
                // pod will be attached to all the networks but we will only use the ip
                // of the default network interface as the pod IP.
                i, err := cni.New(cni.WithMinNetworkCount(networkAttachCount),
                        cni.WithPluginConfDir(dir),
                        cni.WithPluginMaxConfNum(maxConfNum),
                        cni.WithPluginDir([]string{c.config.NetworkPluginBinDir}))
                if err != nil {
                        return fmt.Errorf("failed to initialize cni: %w", err)
                }
                c.netPlugin[name] = i
        }

        if c.allCaps == nil {
                c.allCaps, err = cap.Current()
                if err != nil {
                        return fmt.Errorf("failed to get caps: %w", err)
                }
        }

        if c.config.EnableCDI {
                // Wrap the underlying error so the cause of the failure is not lost.
                if err := cdi.Configure(cdi.WithSpecDirs(c.config.CDISpecDirs...)); err != nil {
                        return fmt.Errorf("failed to configure CDI registry: %w", err)
                }
        }

        return nil
}

// cniLoadOptions returns the CNI load options used on linux: the loopback
// network option followed by the default conf option.
func (c *criService) cniLoadOptions() []cni.Opt {
        opts := make([]cni.Opt, 0, 2)
        opts = append(opts, cni.WithLoNetwork, cni.WithDefaultConf)
        return opts
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "encoding/json"
        "fmt"
        goruntime "runtime"

        "github.com/containerd/containerd/api/services/introspection/v1"
        "github.com/containerd/log"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// networkNotReadyReason is the reason reported in the network condition when
// the default network plugin returns an error from its status check.
const networkNotReadyReason = "NetworkPluginNotReady"

// Status returns the status of the runtime.
//
// The response always carries the runtime-ready and network-ready conditions,
// the runtime handlers, and a condition reporting containerd deprecation
// warnings. With r.Verbose set, Info additionally holds the JSON-encoded
// config, the Go version, the CNI config of the default network plugin (when
// there is one) and the last CNI load status of every conf monitor.
func (c *criService) Status(ctx context.Context, r *runtime.StatusRequest) (*runtime.StatusResponse, error) {
        // As a containerd plugin, if CRI plugin is serving request,
        // containerd must be ready.
        runtimeCond := &runtime.RuntimeCondition{Type: runtime.RuntimeReady, Status: true}
        networkCond := &runtime.RuntimeCondition{Type: runtime.NetworkReady, Status: true}

        // Check the status of the cni initialization
        defaultPlugin := c.netPlugin[defaultNetworkPlugin]
        if defaultPlugin != nil {
                if statusErr := defaultPlugin.Status(); statusErr != nil {
                        networkCond.Status = false
                        networkCond.Reason = networkNotReadyReason
                        networkCond.Message = fmt.Sprintf("Network plugin returns error: %v", statusErr)
                }
        }

        conditions := []*runtime.RuntimeCondition{runtimeCond, networkCond}
        resp := &runtime.StatusResponse{
                Status:          &runtime.RuntimeStatus{Conditions: conditions},
                RuntimeHandlers: c.runtimeHandlers,
        }

        if r.Verbose {
                configJSON, err := json.Marshal(c.config)
                if err != nil {
                        return nil, err
                }
                info := make(map[string]string)
                resp.Info = info
                info["config"] = string(configJSON)

                versionJSON, err := json.Marshal(goruntime.Version())
                if err != nil {
                        return nil, err
                }
                info["golang"] = string(versionJSON)

                if defaultPlugin != nil {
                        // A marshal failure is only logged; the entry is still written.
                        cniJSON, err := json.Marshal(defaultPlugin.GetConfig())
                        if err != nil {
                                log.G(ctx).WithError(err).Errorf("Failed to marshal CNI config %v", err)
                        }
                        info["cniconfig"] = string(cniJSON)
                }

                // Report the last load status per monitor; the default plugin's
                // status is additionally published under the unqualified key.
                defaultStatus := "OK"
                for name, syncer := range c.cniNetConfMonitor {
                        if syncer == nil {
                                continue
                        }
                        status := "OK"
                        if loadErr := syncer.lastStatus(); loadErr != nil {
                                status = loadErr.Error()
                        }
                        info[fmt.Sprintf("lastCNILoadStatus.%s", name)] = status
                        if name == defaultNetworkPlugin {
                                defaultStatus = status
                        }
                }
                info["lastCNILoadStatus"] = defaultStatus
        }

        serverInfo, err := c.client.IntrospectionService().Server(ctx)
        if err != nil {
                return nil, err
        }
        deprecationCond, err := runtimeConditionContainerdHasNoDeprecationWarnings(serverInfo.Deprecations, c.config.IgnoreDeprecationWarnings)
        if err != nil {
                return nil, err
        }
        resp.Status.Conditions = append(resp.Status.Conditions, deprecationCond)
        return resp, nil
}

// runtimeConditionContainerdHasNoDeprecationWarnings builds the runtime
// condition that reports containerd deprecation warnings.
//
// Warnings whose ID appears in ignore are dropped. When none remain the
// condition is true. Otherwise it is false, its reason is
// ContainerdHasDeprecationWarnings, and its message is a JSON object mapping
// each warning ID to its message.
func runtimeConditionContainerdHasNoDeprecationWarnings(deprecations []*introspection.DeprecationWarning, ignore []string) (*runtime.RuntimeCondition, error) {
        ignored := make(map[string]struct{})
        for _, id := range ignore {
                ignored[id] = struct{}{}
        }

        // key: id, value: message
        pending := make(map[string]string)
        for _, warning := range deprecations {
                if _, skip := ignored[warning.ID]; skip {
                        continue
                }
                pending[warning.ID] = warning.Message
        }

        cond := &runtime.RuntimeCondition{
                Type:   ContainerdHasNoDeprecationWarnings,
                Status: true,
        }
        if len(pending) == 0 {
                return cond, nil
        }

        encoded, err := json.Marshal(pending)
        if err != nil {
                return nil, err
        }
        cond.Status = false
        cond.Reason = ContainerdHasDeprecationWarnings
        cond.Message = string(encoded) // Arbitrary string
        return cond, nil
}

// Identifiers used by the runtime condition that reports containerd
// deprecation warnings.
const (
        // ContainerdHasNoDeprecationWarnings is a string for [runtime.RuntimeCondition.Type].
        ContainerdHasNoDeprecationWarnings = "ContainerdHasNoDeprecationWarnings"

        // ContainerdHasDeprecationWarnings is a string for [runtime.RuntimeCondition.Reason].
        // It is set when the condition is false. CamelCase is demanded by the spec.
        // https://github.com/kubernetes/cri-api/blob/v0.29.1/pkg/apis/runtime/v1/api.proto#L1514
        ContainerdHasDeprecationWarnings = "ContainerdHasDeprecationWarnings"
)

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "io"
        "math"

        "k8s.io/apimachinery/pkg/util/runtime"
        "k8s.io/client-go/tools/remotecommand"
        "k8s.io/utils/exec"

        ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
        "k8s.io/kubelet/pkg/cri/streaming"
)

// streamRuntime wraps the CRI service so that it can serve as the
// streaming.Runtime of the streaming server; its Exec, Attach and PortForward
// methods delegate to the wrapped service.
type streamRuntime struct {
        // c is the CRI service the streaming calls are forwarded to.
        c *criService
}

// newStreamRuntime returns a streaming.Runtime backed by the CRI service c.
func newStreamRuntime(c *criService) streaming.Runtime {
        rt := streamRuntime{c: c}
        return &rt
}

// Exec runs cmd inside the container identified by containerID, within the
// CRI namespace. A non-zero exit code is reported as an *exec.CodeExitError;
// a failure to run the command at all is returned as a wrapped error.
func (s *streamRuntime) Exec(ctx context.Context, containerID string, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser,
        tty bool, resize <-chan remotecommand.TerminalSize) error {
        opts := execOptions{
                cmd:    cmd,
                stdin:  stdin,
                stdout: stdout,
                stderr: stderr,
                tty:    tty,
                resize: resize,
        }
        exitCode, err := s.c.execInContainer(ctrdutil.WithNamespace(ctx), containerID, opts)
        if err != nil {
                return fmt.Errorf("failed to exec in container: %w", err)
        }
        if code := *exitCode; code != 0 {
                return &exec.CodeExitError{
                        Err:  fmt.Errorf("error executing command %v, exit code %d", cmd, code),
                        Code: int(code),
                }
        }
        return nil
}

// Attach connects the given streams to the container identified by
// containerID, within the CRI namespace, by delegating to the CRI service.
func (s *streamRuntime) Attach(ctx context.Context, containerID string, in io.Reader, out, err io.WriteCloser, tty bool,
        resize <-chan remotecommand.TerminalSize) error {
        nsCtx := ctrdutil.WithNamespace(ctx)
        return s.c.attachContainer(nsCtx, containerID, in, out, err, tty, resize)
}

// PortForward forwards stream to the given port of the pod sandbox, within
// the CRI namespace. Ports outside 1..math.MaxUint16 are rejected.
func (s *streamRuntime) PortForward(ctx context.Context, podSandboxID string, port int32, stream io.ReadWriteCloser) error {
        if validPort := port > 0 && port <= math.MaxUint16; !validPort {
                return fmt.Errorf("invalid port %d", port)
        }
        return s.c.portForward(ctrdutil.WithNamespace(ctx), podSandboxID, port, stream)
}

// handleResizing starts a goroutine that forwards terminal sizes from resize
// to resizeFunc. Sizes with a zero height or width are skipped. The goroutine
// ends when ctx is done or resize is closed. A nil resize channel starts
// nothing.
func handleResizing(ctx context.Context, resize <-chan remotecommand.TerminalSize, resizeFunc func(size remotecommand.TerminalSize)) {
        if resize == nil {
                return
        }

        forward := func() {
                defer runtime.HandleCrash()

                for {
                        select {
                        case size, open := <-resize:
                                switch {
                                case !open:
                                        return
                                case size.Height >= 1 && size.Width >= 1:
                                        resizeFunc(size)
                                }
                        case <-ctx.Done():
                                return
                        }
                }
        }
        go forward()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"
        "fmt"
        "net"
        "os"
        "path/filepath"
        "strings"
        "text/template"
        "time"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        "github.com/containerd/containerd/v2/pkg/atomicfile"
        "github.com/containerd/log"
)

// cniConfigTemplate contains the values containerd will overwrite
// in the cni config template. It is the data the template is executed with
// when the CNI config file is generated.
type cniConfigTemplate struct {
        // PodCIDR is the cidr for pods on the node.
        PodCIDR string
        // PodCIDRRanges is the cidr ranges for pods on the node.
        PodCIDRRanges []string
        // Routes is a list of routes configured.
        Routes []string
}

const (
        // cniConfigFileName is the name of cni config file generated by containerd.
        cniConfigFileName = "10-containerd-net.conflist"
        // zeroCIDRv6 is the all-addresses route for IPv6; it is added to the
        // routes when any pod CIDR is IPv6.
        zeroCIDRv6 = "::/0"
        // zeroCIDRv4 is the all-addresses route for IPv4; it is added to the
        // routes when any pod CIDR is IPv4.
        zeroCIDRv4 = "0.0.0.0/0"
)

// UpdateRuntimeConfig updates the runtime config. Currently only handles podCIDR updates.
//
// The pod CIDR value of the request is a comma-separated list. A CNI config
// file is generated from the configured template only when all of the
// following hold: the list is non-empty, a template is configured, a default
// network plugin exists, that plugin reports an error status, and loading the
// CNI config fails as well. In every other non-error case an empty response
// is returned without writing anything.
//
// An error is returned if a CIDR cannot be parsed or the config file cannot
// be written.
func (c *criService) UpdateRuntimeConfig(ctx context.Context, r *runtime.UpdateRuntimeConfigRequest) (*runtime.UpdateRuntimeConfigResponse, error) {
        podCIDRs := r.GetRuntimeConfig().GetNetworkConfig().GetPodCidr()
        if podCIDRs == "" {
                return &runtime.UpdateRuntimeConfigResponse{}, nil
        }
        cidrs := strings.Split(podCIDRs, ",")
        for i := range cidrs {
                cidrs[i] = strings.TrimSpace(cidrs[i])
        }
        routes, err := getRoutes(cidrs)
        if err != nil {
                return nil, fmt.Errorf("get routes: %w", err)
        }

        confTemplate := c.config.NetworkPluginConfTemplate
        if confTemplate == "" {
                log.G(ctx).Info("No cni config template is specified, wait for other system components to drop the config.")
                return &runtime.UpdateRuntimeConfigResponse{}, nil
        }
        netPlugin := c.netPlugin[defaultNetworkPlugin]
        if netPlugin == nil {
                // There is no plugin to query here, so do not claim it is ready.
                log.G(ctx).Infof("No default network plugin is configured, skip generating cni config from template %q", confTemplate)
                return &runtime.UpdateRuntimeConfigResponse{}, nil
        }
        netStart := time.Now()
        err = netPlugin.Status()
        networkPluginOperations.WithValues(networkStatusOp).Inc()
        networkPluginOperationsLatency.WithValues(networkStatusOp).UpdateSince(netStart)
        if err == nil {
                log.G(ctx).Infof("Network plugin is ready, skip generating cni config from template %q", confTemplate)
                return &runtime.UpdateRuntimeConfigResponse{}, nil
        }
        networkPluginOperationsErrors.WithValues(networkStatusOp).Inc()
        // The plugin is not ready; an existing config may simply not have been
        // loaded yet, so try loading before generating a new one.
        if err := netPlugin.Load(c.cniLoadOptions()...); err == nil {
                log.G(ctx).Infof("CNI config is successfully loaded, skip generating cni config from template %q", confTemplate)
                return &runtime.UpdateRuntimeConfigResponse{}, nil
        }
        // The first CIDR is passed as the pod CIDR, the full list as the ranges.
        if err := writeCNIConfigFile(ctx, c.config.NetworkPluginConfDir, confTemplate, cidrs[0], cidrs, routes); err != nil {
                return nil, err
        }
        return &runtime.UpdateRuntimeConfigResponse{}, nil
}

// getRoutes returns the default routes needed for the given pod CIDRs: the
// IPv4 zero CIDR when at least one CIDR is IPv4, followed by the IPv6 zero
// CIDR when at least one CIDR is IPv6. The first CIDR that fails to parse
// aborts the scan and its parse error is returned as-is.
func getRoutes(cidrs []string) ([]string, error) {
        var sawV4, sawV6 bool
        for _, raw := range cidrs {
                _, ipNet, err := net.ParseCIDR(raw)
                if err != nil {
                        return nil, err
                }
                if ipNet.IP.To4() == nil {
                        sawV6 = true
                        continue
                }
                sawV4 = true
        }

        var routes []string
        if sawV4 {
                routes = append(routes, zeroCIDRv4)
        }
        if sawV6 {
                routes = append(routes, zeroCIDRv6)
        }
        return routes, nil
}

// writeCNIConfigFile renders the CNI config template at confTemplate with the
// given pod CIDR, pod CIDR ranges and routes, and writes the result to
// cniConfigFileName inside confDir, creating confDir if necessary.
//
// The file is written through atomicfile, so the destination is only replaced
// once the template has rendered successfully and the file has been closed.
// An error is returned if the template cannot be parsed, the directory or
// file cannot be created, rendering fails, or the final close fails.
func writeCNIConfigFile(ctx context.Context, confDir string, confTemplate string, podCIDR string, podCIDRRanges []string, routes []string) error {
        log.G(ctx).Infof("Generating cni config from template %q", confTemplate)
        // generate cni config file from the template with updated pod cidr.
        t, err := template.ParseFiles(confTemplate)
        if err != nil {
                return fmt.Errorf("failed to parse cni config template %q: %w", confTemplate, err)
        }
        if err := os.MkdirAll(confDir, 0755); err != nil {
                return fmt.Errorf("failed to create cni config directory: %q: %w", confDir, err)
        }
        confFile := filepath.Join(confDir, cniConfigFileName)
        f, err := atomicfile.New(confFile, 0o644)
        if err != nil {
                // f is not usable here; closing it would dereference a nil file.
                return fmt.Errorf("failed to create cni config file %q: %w", confFile, err)
        }
        if err := t.Execute(f, cniConfigTemplate{
                PodCIDR:       podCIDR,
                PodCIDRRanges: podCIDRRanges,
                Routes:        routes,
        }); err != nil {
                // Abandon the pending write so a partially rendered config never
                // replaces the destination. The render error is the one worth
                // reporting, so a failure to cancel is intentionally dropped.
                _ = f.Cancel()
                return fmt.Errorf("failed to generate cni config file %q: %w", confFile, err)
        }
        // Close is what publishes the file, so its error must reach the caller.
        if err := f.Close(); err != nil {
                return fmt.Errorf("failed to write cni config file %q: %w", confFile, err)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package server

import (
        "context"

        "github.com/containerd/containerd/v2/version"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        "github.com/containerd/containerd/v2/internal/cri/constants"
)

const (
        // containerName is the runtime name reported in the Version response.
        containerName = "containerd"
        // kubeAPIVersion is the api version of kubernetes.
        // TODO(random-liu): Change this to actual CRI version.
        kubeAPIVersion = "0.1.0"
)

// Version reports the runtime name, the containerd version and the CRI API
// version. The request is not inspected and the call never fails.
func (c *criService) Version(ctx context.Context, r *runtime.VersionRequest) (*runtime.VersionResponse, error) {
        resp := new(runtime.VersionResponse)
        resp.Version = kubeAPIVersion
        resp.RuntimeName = containerName
        resp.RuntimeVersion = version.Version
        resp.RuntimeApiVersion = constants.CRIVersion
        return resp, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package seutil

import (
        "github.com/opencontainers/selinux/go-selinux"
)

// ChangeToKVM returns the process label l with its "type" component replaced
// by the type of the KVM container process label. An empty label, or a host
// without SELinux enabled, yields an empty result and no error.
func ChangeToKVM(l string) (string, error) {
        if l == "" {
                return "", nil
        }
        if !selinux.GetEnabled() {
                return "", nil
        }

        // Only the type of the KVM label is needed, so the label itself is
        // released straight away.
        kvmLabel, _ := selinux.KVMContainerLabels()
        selinux.ReleaseLabel(kvmLabel)

        labelCtx, err := selinux.NewContext(l)
        if err != nil {
                return "", err
        }
        kvmCtx, err := selinux.NewContext(kvmLabel)
        if err != nil {
                return "", err
        }
        labelCtx["type"] = kvmCtx["type"]
        return labelCtx.Get(), nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package container

import (
        "sync"

        containerd "github.com/containerd/containerd/v2/client"
        cio "github.com/containerd/containerd/v2/internal/cri/io"
        "github.com/containerd/containerd/v2/internal/cri/store"
        "github.com/containerd/containerd/v2/internal/cri/store/label"
        "github.com/containerd/containerd/v2/internal/cri/store/stats"
        "github.com/containerd/containerd/v2/internal/truncindex"
        "github.com/containerd/errdefs"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// Container contains all resources associated with the container. All methods to
// mutate the internal state are thread-safe.
type Container struct {
        // Metadata is the metadata of the container. It is **immutable** once created.
        Metadata
        // Status stores the status of the container.
        Status StatusStorage
        // Container is the containerd container client.
        Container containerd.Container
        // IO is the container IO.
        // IO can only be nil when the container is in unknown state.
        IO *cio.ContainerIO
        // StopCh is used to propagate the stop information of the container.
        *store.StopCh
        // IsStopSignaledWithTimeout defaults to 0 and is set to 1 after the stop
        // signal has been sent once, to avoid sending the signal repeatedly.
        IsStopSignaledWithTimeout *uint32
        // Stats contains (mutable) stats for the container.
        Stats *stats.ContainerStats
}

// Opts sets specific information on a newly created Container. NewContainer
// applies each option in order and stops at the first one returning an error.
type Opts func(*Container) error

// WithContainer returns an option that stores the containerd container
// client cntr on the Container. The option itself never fails.
func WithContainer(cntr containerd.Container) Opts {
        apply := func(c *Container) error {
                c.Container = cntr
                return nil
        }
        return apply
}

// WithContainerIO returns an option that attaches the given container IO to
// the Container. The option itself never fails.
func WithContainerIO(io *cio.ContainerIO) Opts {
        attach := func(c *Container) error {
                c.IO = io
                return nil
        }
        return attach
}

// WithStatus returns an option that checkpoints status under root via
// StoreStatus and installs the resulting storage on the Container. When the
// stored status is already in the exited state the container is marked
// stopped right away.
func WithStatus(status Status, root string) Opts {
        return func(c *Container) error {
                storage, err := StoreStatus(root, c.ID, status)
                if err != nil {
                        return err
                }
                c.Status = storage

                exited := storage.Get().State() == runtime.ContainerState_CONTAINER_EXITED
                if exited {
                        c.Stop()
                }
                return nil
        }
}

// NewContainer builds the internal container type from metadata, with a fresh
// stop channel and a zeroed stop-signal flag, and then applies opts in order.
// The first option that fails aborts construction: an empty Container and
// that error are returned.
func NewContainer(metadata Metadata, opts ...Opts) (Container, error) {
        var c Container
        c.Metadata = metadata
        c.StopCh = store.NewStopCh()
        c.IsStopSignaledWithTimeout = new(uint32)

        for i := range opts {
                if err := opts[i](&c); err != nil {
                        return Container{}, err
                }
        }
        return c, nil
}

// Delete deletes the checkpoint for the container by delegating to the
// container's status storage, and returns whatever error that reports.
func (c *Container) Delete() error {
        return c.Status.Delete()
}

// Store stores all Containers.
type Store struct {
        // lock guards containers and idIndex; every Store method takes it.
        lock       sync.RWMutex
        // containers maps a full container id to its Container.
        containers map[string]Container
        // idIndex resolves the ids passed to Get, UpdateContainerStats and
        // Delete to the full ids used as keys in containers.
        idIndex    *truncindex.TruncIndex
        // labels tracks process labels: Add reserves a container's label and
        // Delete releases it.
        labels     *label.Store
}

// NewStore returns an empty container store that reserves and releases
// process labels through the given label store.
func NewStore(labels *label.Store) *Store {
        s := new(Store)
        s.containers = make(map[string]Container)
        s.idIndex = truncindex.NewTruncIndex([]string{})
        s.labels = labels
        return s
}

// Add a container into the store. Returns errdefs.ErrAlreadyExists if the
// container already exists.
//
// The container's process label is reserved before the id is indexed. If
// either step fails the error is returned and the store is left unchanged.
func (s *Store) Add(c Container) error {
        s.lock.Lock()
        defer s.lock.Unlock()
        if _, ok := s.containers[c.ID]; ok {
                return errdefs.ErrAlreadyExists
        }
        if err := s.labels.Reserve(c.ProcessLabel); err != nil {
                return err
        }
        if err := s.idIndex.Add(c.ID); err != nil {
                // The container is not being stored, so give back the label
                // reserved above; otherwise the reservation would never be
                // released because Delete cannot find this container.
                s.labels.Release(c.ProcessLabel)
                return err
        }
        s.containers[c.ID] = c
        return nil
}

// Get looks up a container by id, resolving the id through the id index
// first. errdefs.ErrNotFound is returned when the index does not know the id
// or when no container is stored under the resolved id; any other index error
// is returned unchanged.
func (s *Store) Get(id string) (Container, error) {
        s.lock.RLock()
        defer s.lock.RUnlock()

        fullID, err := s.idIndex.Get(id)
        switch {
        case err == truncindex.ErrNotExist:
                return Container{}, errdefs.ErrNotFound
        case err != nil:
                return Container{}, err
        }

        c, ok := s.containers[fullID]
        if !ok {
                return Container{}, errdefs.ErrNotFound
        }
        return c, nil
}

// List returns every container currently in the store, in map iteration
// order. The result is nil when the store is empty.
func (s *Store) List() []Container {
        s.lock.RLock()
        defer s.lock.RUnlock()

        var all []Container
        for id := range s.containers {
                all = append(all, s.containers[id])
        }
        return all
}

// UpdateContainerStats replaces the stats of the container identified by id
// with newContainerStats. errdefs.ErrNotFound is returned when the id is
// unknown to the index or no container is stored under the resolved id; any
// other index error is returned unchanged.
func (s *Store) UpdateContainerStats(id string, newContainerStats *stats.ContainerStats) error {
        s.lock.Lock()
        defer s.lock.Unlock()

        fullID, err := s.idIndex.Get(id)
        if err != nil {
                if err == truncindex.ErrNotExist {
                        return errdefs.ErrNotFound
                }
                return err
        }

        c, found := s.containers[fullID]
        if !found {
                return errdefs.ErrNotFound
        }

        // Containers are stored by value, so the modified copy is written back.
        c.Stats = newContainerStats
        s.containers[fullID] = c
        return nil
}

// Delete removes the container with the specified id from the store, closing
// its IO (when present) and releasing its process label. Ids the index cannot
// resolve are silently ignored.
func (s *Store) Delete(id string) {
        s.lock.Lock()
        defer s.lock.Unlock()

        fullID, err := s.idIndex.Get(id)
        if err != nil {
                // Note: The idIndex.Delete and delete doesn't handle truncated index.
                // So we need to return if there are error.
                return
        }

        target := s.containers[fullID]
        if cntrIO := target.IO; cntrIO != nil {
                cntrIO.Close()
        }
        s.labels.Release(target.ProcessLabel)
        s.idIndex.Delete(fullID)
        delete(s.containers, fullID)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package container

import "sync"

// WithFakeStatus returns an option that installs an in-memory fake status
// storage holding status. A status with a non-zero FinishedAt also marks the
// container as stopped.
func WithFakeStatus(status Status) Opts {
        return func(c *Container) error {
                c.Status = &fakeStatusStorage{status: status}
                if status.FinishedAt == 0 {
                        return nil
                }
                // Fake the TaskExit event
                c.Stop()
                return nil
        }
}

// fakeStatusStorage is a fake status storage for testing. It keeps the status
// in memory only; the embedded RWMutex guards status.
type fakeStatusStorage struct {
        sync.RWMutex
        status Status
}

// Get returns the currently held status, read under the read lock.
func (f *fakeStatusStorage) Get() Status {
        f.RLock()
        current := f.status
        f.RUnlock()
        return current
}

// UpdateSync behaves exactly like Update: the fake has no on-disk checkpoint
// to synchronize.
func (f *fakeStatusStorage) UpdateSync(u UpdateFunc) error {
        return f.Update(u)
}

// Update applies u to the held status under the write lock. The result is
// stored only when u succeeds; otherwise the status is left untouched and the
// error from u is returned.
func (f *fakeStatusStorage) Update(u UpdateFunc) error {
        f.Lock()
        defer f.Unlock()

        updated, err := u(f.status)
        if err == nil {
                f.status = updated
        }
        return err
}

// Delete is a no-op for the fake storage and always returns nil.
func (f *fakeStatusStorage) Delete() error {
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package container

import (
        "encoding/json"
        "fmt"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// NOTE(random-liu):
// 1) Metadata is immutable after created.
// 2) Metadata is checkpointed as containerd container label.

// metadataVersion is the current version of the container metadata. It is
// written by MarshalJSON and is the only version UnmarshalJSON accepts.
const metadataVersion = "v1"

// versionedMetadata is the internal versioned container metadata. It is the
// shape that Metadata is actually encoded to and decoded from.
type versionedMetadata struct {
        // Version indicates the version of the versioned container metadata.
        Version string
        // Metadata's type is metadataInternal. If it were Metadata, encoding it
        // would call Metadata.MarshalJSON again and recurse forever.
        Metadata metadataInternal
}

// metadataInternal is for internal use. It has the same fields as Metadata
// but, being a distinct defined type, none of Metadata's methods.
type metadataInternal Metadata

// Metadata is the unversioned container metadata.
type Metadata struct {
        // ID is the container id.
        ID string
        // Name is the container name.
        Name string
        // SandboxID is the sandbox id the container belongs to.
        SandboxID string
        // Config is the CRI container config.
        // NOTE(random-liu): Resource limits are updatable; the source
        // of truth for resource limits is in containerd.
        Config *runtime.ContainerConfig
        // ImageRef is the reference of image used by the container.
        ImageRef string
        // LogPath is the container log path.
        LogPath string
        // StopSignal is the system call signal that will be sent to the container to exit.
        // TODO(random-liu): Add integration test for stop signal.
        StopSignal string
        // ProcessLabel is the SELinux process label for the container.
        ProcessLabel string
}

// MarshalJSON encodes Metadata as JSON, wrapped together with the current
// metadata version.
func (c *Metadata) MarshalJSON() ([]byte, error) {
        versioned := versionedMetadata{
                Version:  metadataVersion,
                Metadata: metadataInternal(*c),
        }
        return json.Marshal(&versioned)
}

// UnmarshalJSON decodes versioned Metadata from JSON. Only the current
// metadata version is accepted; any other version leaves the receiver
// untouched and yields an "unsupported version" error.
func (c *Metadata) UnmarshalJSON(data []byte) error {
        var versioned versionedMetadata
        if err := json.Unmarshal(data, &versioned); err != nil {
                return err
        }
        // Handle old version after upgrade.
        if versioned.Version != metadataVersion {
                return fmt.Errorf("unsupported version: %q", versioned.Version)
        }
        *c = Metadata(versioned.Metadata)
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package container

import (
        "encoding/json"
        "errors"
        "fmt"
        "os"
        "path/filepath"
        "sync"

        "github.com/containerd/continuity"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// The container state machine in the CRI plugin:
//
//                         +              +
//                         |              |
//                         | Create       | Load
//                         |              |
//                    +----v----+         |
//                    |         |         |
//                    | CREATED <---------+-----------+
//                    |         |         |           |
//                    +----+----+         |           |
//                         |              |           |
//                         | Start        |           |
//                         |              |           |
//                    +----v----+         |           |
//   Exec    +--------+         |         |           |
//  Attach   |        | RUNNING <---------+           |
// LogReopen +-------->         |         |           |
//                    +----+----+         |           |
//                         |              |           |
//                         | Stop/Exit    |           |
//                         |              |           |
//                    +----v----+         |           |
//                    |         <---------+      +----v----+
//                    |  EXITED |                |         |
//                    |         <----------------+ UNKNOWN |
//                    +----+----+       Stop     |         |
//                         |                     +---------+
//                         | Remove
//                         v
//                      DELETED

// statusVersion is the current version of the container status. It is written
// by Status.encode and is the only version Status.decode accepts.
const statusVersion = "v1"

// versionedStatus is the internally used versioned container status. Status
// is embedded rather than stored in a named field.
type versionedStatus struct {
        // Version indicates the version of the versioned container status.
        Version string
        Status
}

// Status is the status of a container.
type Status struct {
        // Pid is the init process id of the container.
        Pid uint32
        // CreatedAt is the created timestamp.
        CreatedAt int64
        // StartedAt is the started timestamp.
        StartedAt int64
        // FinishedAt is the finished timestamp.
        FinishedAt int64
        // ExitCode is the container exit code.
        ExitCode int32
        // Reason is a CamelCase string explaining why the container is in its
        // current state.
        Reason string
        // Message is a human-readable message indicating details about why the
        // container is in its current state.
        Message string
        // Starting indicates that the container is in starting state.
        // This field doesn't need to be checkpointed.
        Starting bool `json:"-"`
        // Removing indicates that the container is in removing state.
        // This field doesn't need to be checkpointed.
        Removing bool `json:"-"`
        // Unknown indicates that the container status is not fully loaded.
        // This field doesn't need to be checkpointed.
        Unknown bool `json:"-"`
        // Resources holds the container runtime resource constraints.
        Resources *runtime.ContainerResources
}

// State derives the container state from the status. The checks are ordered:
// the Unknown flag wins, then a finished, started and created timestamp map to
// EXITED, RUNNING and CREATED respectively. A status with none of these set is
// reported as UNKNOWN.
func (s Status) State() runtime.ContainerState {
        switch {
        case s.Unknown:
                return runtime.ContainerState_CONTAINER_UNKNOWN
        case s.FinishedAt != 0:
                return runtime.ContainerState_CONTAINER_EXITED
        case s.StartedAt != 0:
                return runtime.ContainerState_CONTAINER_RUNNING
        case s.CreatedAt != 0:
                return runtime.ContainerState_CONTAINER_CREATED
        default:
                return runtime.ContainerState_CONTAINER_UNKNOWN
        }
}

// encode serializes Status to JSON together with the current status version.
func (s *Status) encode() ([]byte, error) {
        versioned := versionedStatus{
                Version: statusVersion,
                Status:  *s,
        }
        return json.Marshal(&versioned)
}

// decode parses versioned JSON into the receiver. Only the current status
// version is accepted; any other version leaves the receiver untouched and
// yields an "unsupported version" error.
func (s *Status) decode(data []byte) error {
        var versioned versionedStatus
        if err := json.Unmarshal(data, &versioned); err != nil {
                return err
        }
        // Handle old version after upgrade.
        if versioned.Version != statusVersion {
                return errors.New("unsupported version")
        }
        *s = versioned.Status
        return nil
}

// UpdateFunc is the function used to update the container status. It receives
// the current status and returns the new one. If it returns an error, the
// update is rolled back, i.e. the stored status is left unchanged.
type UpdateFunc func(Status) (Status, error)

// StatusStorage manages the container status with a storage backend.
type StatusStorage interface {
        // Get returns the container status.
        Get() Status
        // UpdateSync updates the container status and the on-disk checkpoint.
        // Note that the update MUST be applied in one transaction.
        UpdateSync(UpdateFunc) error
        // Update updates the container status. Note that the update MUST be
        // applied in one transaction.
        Update(UpdateFunc) error
        // Delete deletes the container status.
        // Note:
        // * Delete should be idempotent.
        // * The status must be deleted in one transaction.
        Delete() error
}

// StoreStatus checkpoints status to the "status" file under root and returns
// a storage backed by that file. The checkpoint is written with an atomic
// file write so the status is created in one transaction. The id argument is
// currently not used.
func StoreStatus(root, id string, status Status) (StatusStorage, error) {
        encoded, err := status.encode()
        if err != nil {
                return nil, fmt.Errorf("failed to encode status: %w", err)
        }

        checkpoint := filepath.Join(root, "status")
        err = continuity.AtomicWriteFile(checkpoint, encoded, 0600)
        if err != nil {
                return nil, fmt.Errorf("failed to checkpoint status to %q: %w", checkpoint, err)
        }

        storage := &statusStorage{path: checkpoint, status: status}
        return storage, nil
}

// LoadStatus reads and decodes the container status checkpointed in the
// "status" file under root. No other thread should be writing to the file
// while it is loaded. The id argument is currently not used.
func LoadStatus(root, id string) (Status, error) {
        checkpoint := filepath.Join(root, "status")
        raw, err := os.ReadFile(checkpoint)
        if err != nil {
                return Status{}, fmt.Errorf("failed to read status from %q: %w", checkpoint, err)
        }

        loaded := Status{}
        if err = loaded.decode(raw); err != nil {
                return Status{}, fmt.Errorf("failed to decode status %q: %w", raw, err)
        }
        return loaded, nil
}

// statusStorage is the StatusStorage returned by StoreStatus. It keeps the
// current status in memory and checkpoints it to the file at path.
type statusStorage struct {
        // The embedded RWMutex guards status.
        sync.RWMutex
        // path is the location of the on-disk status checkpoint.
        path   string
        // status is the in-memory copy of the current container status.
        status Status
}

// Get returns a deep copy of the container status, taken under the read lock,
// so that later changes to the stored status cannot affect the returned value.
func (s *statusStorage) Get() Status {
        s.RLock()
        defer s.RUnlock()

        snapshot := deepCopyOf(s.status)
        return snapshot
}

// deepCopyOf returns a copy of s that shares no mutable state with it.
//
// All scalar fields are copied by plain assignment. Resources is the only
// pointer field and is rebuilt by hand: the Linux and Windows sections are
// copied independently, so a status carrying both keeps both, and the
// hugepage limits and the Unified map are duplicated rather than aliased.
// A nil Resources stays nil; a non-nil Resources always yields a non-nil copy.
//
// This will need updates when new fields are added to ContainerResources.
func deepCopyOf(s Status) Status {
        dup := s
        if s.Resources == nil {
                return dup
        }
        dup.Resources = &runtime.ContainerResources{}

        if linux := s.Resources.Linux; linux != nil {
                hugepageLimits := make([]*runtime.HugepageLimit, 0, len(linux.HugepageLimits))
                for _, l := range linux.HugepageLimits {
                        if l != nil {
                                hugepageLimits = append(hugepageLimits, &runtime.HugepageLimit{
                                        PageSize: l.PageSize,
                                        Limit:    l.Limit,
                                })
                        }
                }

                // Maps are reference types: copy the entries so that writes to
                // the original map are not visible through the copy. A nil map
                // stays nil.
                var unified map[string]string
                if linux.Unified != nil {
                        unified = make(map[string]string, len(linux.Unified))
                        for k, v := range linux.Unified {
                                unified[k] = v
                        }
                }

                dup.Resources.Linux = &runtime.LinuxContainerResources{
                        CpuPeriod:              linux.CpuPeriod,
                        CpuQuota:               linux.CpuQuota,
                        CpuShares:              linux.CpuShares,
                        CpusetCpus:             linux.CpusetCpus,
                        CpusetMems:             linux.CpusetMems,
                        MemoryLimitInBytes:     linux.MemoryLimitInBytes,
                        MemorySwapLimitInBytes: linux.MemorySwapLimitInBytes,
                        OomScoreAdj:            linux.OomScoreAdj,
                        Unified:                unified,
                        HugepageLimits:         hugepageLimits,
                }
        }

        // Set the Windows section on the same Resources value instead of
        // replacing it, so an already copied Linux section is not discarded.
        if windows := s.Resources.Windows; windows != nil {
                dup.Resources.Windows = &runtime.WindowsContainerResources{
                        CpuShares:          windows.CpuShares,
                        CpuCount:           windows.CpuCount,
                        CpuMaximum:         windows.CpuMaximum,
                        MemoryLimitInBytes: windows.MemoryLimitInBytes,
                        RootfsSizeInBytes:  windows.RootfsSizeInBytes,
                }
        }
        return dup
}

// UpdateSync applies u to the current status and checkpoints the result to
// disk before committing it in memory. If u fails, or the new status cannot
// be encoded or written, the in-memory status is left unchanged.
func (s *statusStorage) UpdateSync(u UpdateFunc) error {
        s.Lock()
        defer s.Unlock()

        updated, err := u(s.status)
        if err != nil {
                return err
        }

        encoded, err := updated.encode()
        if err != nil {
                return fmt.Errorf("failed to encode status: %w", err)
        }

        err = continuity.AtomicWriteFile(s.path, encoded, 0600)
        if err != nil {
                return fmt.Errorf("failed to checkpoint status to %q: %w", s.path, err)
        }

        s.status = updated
        return nil
}

// Update applies u to the in-memory status only; the on-disk checkpoint is
// not touched. The result is stored only when u succeeds, otherwise the error
// from u is returned and the status is left unchanged.
func (s *statusStorage) Update(u UpdateFunc) error {
        s.Lock()
        defer s.Unlock()

        updated, err := u(s.status)
        if err == nil {
                s.status = updated
        }
        return err
}

// Delete removes the on-disk status checkpoint. The file is first renamed to
// a temporary name and the temporary path is then removed. A checkpoint that
// does not exist is not an error.
func (s *statusStorage) Delete() error {
        // The temporary name is built by plain string concatenation, so it is
        // the checkpoint's directory path with ".del-<file name>" appended.
        dir, base := filepath.Dir(s.path), filepath.Base(s.path)
        temp := dir + ".del-" + base

        err := os.Rename(s.path, temp)
        if err != nil && !os.IsNotExist(err) {
                return err
        }
        return os.RemoveAll(temp)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package image

import (
        "fmt"

        "github.com/containerd/platforms"
)

// NewFakeStore returns an image store pre-populated with the given images:
// each image's references are recorded in the reference cache and the image
// is added to the internal store. The store is created without an image
// getter or content provider, so Update is not allowed on it.
func NewFakeStore(images []Image) (*Store, error) {
        s := NewStore(nil, nil, platforms.Default())
        for idx := range images {
                img := images[idx]
                for _, ref := range img.References {
                        s.refCache[ref] = img.ID
                }
                if err := s.store.add(img); err != nil {
                        return nil, fmt.Errorf("add image %+v: %w", img, err)
                }
        }
        return s, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package image

import (
        "context"
        "encoding/json"
        "fmt"
        "sync"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/images/usage"
        "github.com/containerd/containerd/v2/internal/cri/labels"
        "github.com/containerd/containerd/v2/internal/cri/util"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
        docker "github.com/distribution/reference"
        "k8s.io/apimachinery/pkg/util/sets"

        imagedigest "github.com/opencontainers/go-digest"
        "github.com/opencontainers/go-digest/digestset"
        imageidentity "github.com/opencontainers/image-spec/identity"
        imagespec "github.com/opencontainers/image-spec/specs-go/v1"
)

// Image contains all resources associated with the image. Fields
// MUST NOT be mutated directly after creation.
type Image struct {
        // ID is the id of the image. Normally the digest of image config.
        ID string
        // References are references to the image, e.g. RepoTag and RepoDigest.
        References []string
        // ChainID is the chainID of the image.
        ChainID string
        // Size is the compressed size of the image.
        Size int64
        // ImageSpec is the oci image structure which describes basic information about the image.
        ImageSpec imagespec.Image
        // Pinned marks the image as pinned to prevent it from garbage collection.
        Pinned bool
}

// Getter is used to get images but does not make changes.
type Getter interface {
        // Get returns the image with the given name.
        Get(ctx context.Context, name string) (images.Image, error)
}

// Store stores all images.
type Store struct {
        // lock is held by Update for the whole duration of a cache update.
        lock sync.RWMutex
        // refCache is a containerd image reference to image id cache.
        refCache map[string]string

        // images is the local image store.
        images Getter

        // provider is the content provider.
        provider content.InfoReaderProvider

        // platform represents the currently supported platform for images.
        // TODO: Make this store multi-platform
        platform platforms.MatchComparer

        // store is the internal image store indexed by image id.
        store *store
}

// NewStore returns an empty image Store. img is used to look images up by
// reference, provider to read their content, and platform is the matcher
// handed to every per-image lookup.
func NewStore(img Getter, provider content.InfoReaderProvider, platform platforms.MatchComparer) *Store {
        // Build the id-indexed backing store first, then wrap it.
        backing := &store{
                images:     make(map[string]Image),
                digestSet:  digestset.NewSet(),
                pinnedRefs: make(map[string]sets.Set[string]),
        }
        return &Store{
                refCache: make(map[string]string),
                images:   img,
                provider: provider,
                platform: platform,
                store:    backing,
        }
}

// Update refreshes the cached state for ref from containerd. A reference
// that containerd reports as not found is removed from the cache; any other
// lookup failure is returned to the caller.
func (s *Store) Update(ctx context.Context, ref string) error {
        s.lock.Lock()
        defer s.lock.Unlock()

        i, err := s.images.Get(ctx, ref)
        switch {
        case err == nil:
                img, err := s.getImage(ctx, i)
                if err != nil {
                        return fmt.Errorf("get image info from containerd: %w", err)
                }
                return s.update(ref, img)
        case errdefs.IsNotFound(err):
                // The reference is gone: a nil image tells update to drop it.
                return s.update(ref, nil)
        default:
                return fmt.Errorf("get image from containerd: %w", err)
        }
}

// update applies a new state for ref to the internal cache. A nil img means
// the reference no longer exists in containerd. The caller must hold s.lock.
func (s *Store) update(ref string, img *Image) error {
        prevID, cached := s.refCache[ref]

        if img == nil {
                // The reference is gone; forget it if we knew about it.
                if cached {
                        s.store.delete(prevID, ref)
                        delete(s.refCache, ref)
                }
                return nil
        }

        if cached && prevID == img.ID {
                // Same image as before: only the pinned state may have changed.
                switch {
                case s.store.isPinned(img.ID, ref) == img.Pinned:
                        return nil
                case img.Pinned:
                        return s.store.pin(img.ID, ref)
                default:
                        return s.store.unpin(img.ID, ref)
                }
        }

        if cached {
                // The reference moved to another image; detach it from the old one.
                s.store.delete(prevID, ref)
        }
        s.refCache[ref] = img.ID
        return s.store.add(*img)
}

// getImage builds the store's Image for the containerd image i, resolved
// against the store's platform: chain id, compressed size, config digest (used
// as the id), decoded config and pinned label.
func (s *Store) getImage(ctx context.Context, i images.Image) (*Image, error) {
        layers, err := i.RootFS(ctx, s.provider, s.platform)
        if err != nil {
                return nil, fmt.Errorf("get image diffIDs: %w", err)
        }
        chain := imageidentity.ChainID(layers)

        compressed, err := usage.CalculateImageUsage(ctx, i, s.provider,
                usage.WithManifestLimit(s.platform, 1),
                usage.WithManifestUsage(),
        )
        if err != nil {
                return nil, fmt.Errorf("get image compressed resource size: %w", err)
        }

        cfgDesc, err := i.Config(ctx, s.provider, s.platform)
        if err != nil {
                return nil, fmt.Errorf("get image config descriptor: %w", err)
        }

        raw, err := content.ReadBlob(ctx, s.provider, cfgDesc)
        if err != nil {
                return nil, fmt.Errorf("read image config from content store: %w", err)
        }

        result := &Image{
                ID:         cfgDesc.Digest.String(),
                References: []string{i.Name},
                ChainID:    chain.String(),
                Size:       compressed,
                Pinned:     i.Labels[labels.PinnedImageLabelKey] == labels.PinnedImageLabelValue,
        }
        // Decode the config straight into the result.
        if err := json.Unmarshal(raw, &result.ImageSpec); err != nil {
                return nil, fmt.Errorf("unmarshal image config %s: %w", raw, err)
        }
        return result, nil
}

// Resolve maps an image reference to the id of the image it points at.
// Returns errdefs.ErrNotFound if the reference is not cached.
func (s *Store) Resolve(ref string) (string, error) {
        s.lock.RLock()
        defer s.lock.RUnlock()
        if id, found := s.refCache[ref]; found {
                return id, nil
        }
        return "", errdefs.ErrNotFound
}

// Get returns the image metadata for the given image id, which may be
// truncated.
// Returns various validation errors if the image id is invalid.
// Returns errdefs.ErrNotFound if the image doesn't exist.
func (s *Store) Get(id string) (Image, error) {
        img, err := s.store.get(id)
        return img, err
}

// List returns every image currently held by the store.
func (s *Store) List() []Image {
        all := s.store.list()
        return all
}

// store is the internal image store indexed by image id.
//
// lock guards all other fields. images maps an image id to its Image,
// digestSet resolves (possibly truncated) ids to full digests, and pinnedRefs
// maps an image id to the set of its references that are pinned; only images
// with at least one pinned reference keep an entry there.
type store struct {
        lock       sync.RWMutex
        images     map[string]Image
        digestSet  *digestset.Set
        pinnedRefs map[string]sets.Set[string]
}

// list returns a snapshot of all stored images, in no particular order.
// The result is nil when the store is empty.
func (s *store) list() []Image {
        s.lock.RLock()
        defer s.lock.RUnlock()
        var all []Image
        for id := range s.images {
                all = append(all, s.images[id])
        }
        return all
}

// add inserts img into the store. If an image with the same id is already
// present, the references of both are merged and sorted, and the image stays
// pinned if either side is pinned.
func (s *store) add(img Image) error {
        s.lock.Lock()
        defer s.lock.Unlock()

        id := img.ID

        // Register the digest unless it is already known.
        switch _, err := s.digestSet.Lookup(id); err {
        case nil:
        case digestset.ErrDigestNotFound:
                if err := s.digestSet.Add(imagedigest.Digest(id)); err != nil {
                        return err
                }
        default:
                return err
        }

        if img.Pinned {
                // Record every incoming reference as pinned for this image.
                if refs := s.pinnedRefs[id]; refs != nil {
                        refs.Insert(img.References...)
                } else {
                        s.pinnedRefs[id] = sets.New(img.References...)
                }
        }

        if existing, found := s.images[id]; found {
                // Already stored: fold the new references into the existing entry.
                existing.References = docker.Sort(util.MergeStringSlices(existing.References, img.References))
                existing.Pinned = existing.Pinned || img.Pinned
                s.images[id] = existing
                return nil
        }
        s.images[id] = img
        return nil
}

// isPinned reports whether ref is recorded as a pinned reference of the image
// identified by id. It returns false when id cannot be resolved.
func (s *store) isPinned(id, ref string) bool {
        s.lock.RLock()
        defer s.lock.RUnlock()
        digest, err := s.digestSet.Lookup(id)
        if err != nil {
                return false
        }
        if refs := s.pinnedRefs[digest.String()]; refs != nil {
                return refs.Has(ref)
        }
        return false
}

// pin records ref as a pinned reference of the image identified by id and
// marks the image as pinned. Returns errdefs.ErrNotFound if the image is not
// in the store.
func (s *store) pin(id, ref string) error {
        s.lock.Lock()
        defer s.lock.Unlock()

        digest, err := s.digestSet.Lookup(id)
        switch {
        case err == digestset.ErrDigestNotFound:
                return errdefs.ErrNotFound
        case err != nil:
                return err
        }

        key := digest.String()
        img, found := s.images[key]
        if !found {
                return errdefs.ErrNotFound
        }

        if refs := s.pinnedRefs[key]; refs != nil {
                refs.Insert(ref)
        } else {
                s.pinnedRefs[key] = sets.New(ref)
        }
        img.Pinned = true
        s.images[key] = img
        return nil
}

// unpin removes ref from the pinned references of the image identified by id.
// The image itself is marked unpinned only once its last pinned reference is
// gone. Returns errdefs.ErrNotFound if the image is not in the store.
func (s *store) unpin(id, ref string) error {
        s.lock.Lock()
        defer s.lock.Unlock()

        digest, err := s.digestSet.Lookup(id)
        switch {
        case err == digestset.ErrDigestNotFound:
                return errdefs.ErrNotFound
        case err != nil:
                return err
        }

        key := digest.String()
        img, found := s.images[key]
        if !found {
                return errdefs.ErrNotFound
        }

        refs := s.pinnedRefs[key]
        if refs == nil {
                return nil
        }
        refs.Delete(ref)
        if len(refs) > 0 {
                // Other references still pin this image.
                return nil
        }

        // No pinned reference is left. Only pinned images keep an entry in
        // pinnedRefs, so drop it and clear the flag.
        delete(s.pinnedRefs, key)
        img.Pinned = false
        s.images[key] = img
        return nil
}

// get returns the image identified by id, which is resolved through the
// digest set first. Returns errdefs.ErrNotFound if no such image is stored.
func (s *store) get(id string) (Image, error) {
        s.lock.RLock()
        defer s.lock.RUnlock()

        digest, err := s.digestSet.Lookup(id)
        switch {
        case err == digestset.ErrDigestNotFound:
                return Image{}, errdefs.ErrNotFound
        case err != nil:
                return Image{}, err
        }

        img, found := s.images[digest.String()]
        if !found {
                return Image{}, errdefs.ErrNotFound
        }
        return img, nil
}

// delete detaches ref from the image identified by id. The image is removed
// from the store entirely once no reference is left. Unknown ids are ignored.
func (s *store) delete(id, ref string) {
        s.lock.Lock()
        defer s.lock.Unlock()

        digest, err := s.digestSet.Lookup(id)
        if err != nil {
                // The id could not be resolved to a stored digest, so there is
                // nothing to remove.
                return
        }
        key := digest.String()
        img, found := s.images[key]
        if !found {
                return
        }

        img.References = util.SubtractStringSlice(img.References, ref)
        if len(img.References) == 0 {
                // Remove the image if it is not referenced any more.
                s.digestSet.Remove(digest)
                delete(s.images, key)
                delete(s.pinnedRefs, key)
                return
        }

        if refs := s.pinnedRefs[key]; refs != nil {
                refs.Delete(ref)
                if len(refs) == 0 {
                        // The last pinned reference is gone. Only pinned images
                        // keep an entry in pinnedRefs.
                        img.Pinned = false
                        delete(s.pinnedRefs, key)
                }
        }
        s.images[key] = img
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package label

import (
        "sync"

        "github.com/opencontainers/selinux/go-selinux"
)

// Store is used to store SELinux process labels.
//
// The embedded mutex guards levels, which counts how many times each MLS/MCS
// level has been reserved. Reserver is called when a level is reserved for
// the first time and Releaser when its last reservation is released; NewStore
// sets them to the go-selinux implementations.
type Store struct {
        sync.Mutex
        levels   map[string]int
        Releaser func(string)
        Reserver func(string)
}

// NewStore returns an empty SELinux process label store whose reserve and
// release hooks are the go-selinux ReserveLabel and ReleaseLabel functions.
func NewStore() *Store {
        s := &Store{levels: make(map[string]int)}
        s.Releaser = selinux.ReleaseLabel
        s.Reserver = selinux.ReserveLabel
        return s
}

// Reserve reserves the MLS/MCS level component of the specified label.
// The level is counted per reservation, and Reserver is only invoked the
// first time a given level is seen. Labels without a level are ignored.
func (s *Store) Reserve(label string) error {
        s.Lock()
        defer s.Unlock()

        parsed, err := selinux.NewContext(label)
        if err != nil {
                return err
        }

        // An empty level has nothing to count.
        if level := parsed["level"]; level != "" {
                if _, reserved := s.levels[level]; !reserved {
                        s.Reserver(label)
                }
                s.levels[level]++
        }
        return nil
}

// Release un-reserves the MLS/MCS level component of the specified label.
// Releaser is invoked only when the last reservation of the level goes away,
// allowing the level to be used by another process. Labels that cannot be
// parsed, have no level, or were never reserved are ignored.
func (s *Store) Release(label string) {
        s.Lock()
        defer s.Unlock()

        parsed, err := selinux.NewContext(label)
        if err != nil {
                return
        }
        level := parsed["level"]
        if level == "" {
                return
        }

        count, tracked := s.levels[level]
        if !tracked {
                return
        }
        if count > 1 {
                // Other holders remain; just drop one reservation.
                s.levels[level] = count - 1
                return
        }
        if count == 1 {
                s.Releaser(label)
        }
        // count <= 1: the entry is no longer needed.
        delete(s.levels, level)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sandbox

import (
        "encoding/json"
        "fmt"

        cni "github.com/containerd/go-cni"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// NOTE(random-liu):
// 1) Metadata is immutable after created.
// 2) Metadata is checkpointed as containerd container label.

// metadataVersion is the current version of sandbox metadata. It is written
// by MarshalJSON and is the only version UnmarshalJSON accepts.
const metadataVersion = "v1"

// versionedMetadata is the internal versioned sandbox metadata. It is the
// shape that Metadata is encoded to and decoded from.
type versionedMetadata struct {
        // Version indicates the version of the versioned sandbox metadata.
        Version string
        // Metadata's type is metadataInternal. If it were Metadata, there would
        // be a recursive call in MarshalJSON.
        Metadata metadataInternal
}

// metadataInternal is for internal use. It has the same fields as Metadata
// but, being a distinct type, none of its methods.
type metadataInternal Metadata

// Metadata is the unversioned sandbox metadata.
type Metadata struct {
        // ID is the sandbox id.
        ID string
        // Name is the sandbox name.
        Name string
        // Config is the CRI sandbox config.
        Config *runtime.PodSandboxConfig
        // NetNSPath is the network namespace used by the sandbox.
        NetNSPath string
        // IP is the IP of the Pod if it is attached to a non-host network.
        IP string
        // AdditionalIPs are the additional IPs of the Pod if it is attached to a
        // non-host network.
        AdditionalIPs []string
        // RuntimeHandler is the runtime handler name of the pod.
        RuntimeHandler string
        // CNIResult is the resulting configuration for the attached network
        // namespace interfaces.
        CNIResult *cni.Result
        // ProcessLabel is the SELinux process label for the container.
        ProcessLabel string
}

// MarshalJSON encodes Metadata as JSON, wrapped together with the current
// metadata version.
func (c *Metadata) MarshalJSON() ([]byte, error) {
        versioned := versionedMetadata{
                Version:  metadataVersion,
                Metadata: metadataInternal(*c),
        }
        return json.Marshal(&versioned)
}

// UnmarshalJSON decodes versioned Metadata from JSON. It fails if the encoded
// version is not the current metadataVersion.
func (c *Metadata) UnmarshalJSON(data []byte) error {
        var versioned versionedMetadata
        if err := json.Unmarshal(data, &versioned); err != nil {
                return err
        }
        // Only the current version is understood; handling for older versions
        // after an upgrade would go here.
        if versioned.Version != metadataVersion {
                return fmt.Errorf("unsupported version: %q", versioned.Version)
        }
        *c = Metadata(versioned.Metadata)
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sandbox

import (
        "sync"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/internal/cri/store"
        "github.com/containerd/containerd/v2/internal/cri/store/label"
        "github.com/containerd/containerd/v2/internal/cri/store/stats"
        "github.com/containerd/containerd/v2/internal/truncindex"
        "github.com/containerd/containerd/v2/pkg/netns"
        "github.com/containerd/errdefs"
)

// Sandbox contains all resources associated with the sandbox. All methods to
// mutate the internal state are thread safe.
type Sandbox struct {
        // Metadata is the metadata of the sandbox, it is immutable after created.
        Metadata
        // Status stores the status of the sandbox.
        Status StatusStorage
        // Container is the containerd sandbox container client.
        Container containerd.Container
        // Sandboxer is the sandbox controller name of the sandbox.
        Sandboxer string
        // NetNS is the CNI network namespace client.
        // For hostnetwork pod, this is always nil;
        // For non hostnetwork pod, this should never be nil.
        NetNS *netns.NetNS
        // StopCh is used to propagate the stop information of the sandbox.
        *store.StopCh
        // Stats contains (mutable) stats for the (pause) sandbox container.
        Stats *stats.ContainerStats
        // Endpoint is the sandbox endpoint, for task or streaming api connection.
        Endpoint Endpoint
}

// Endpoint describes a sandbox endpoint by its address and version.
type Endpoint struct {
        Address string
        Version uint32
}

// IsValid reports whether the endpoint has a non-empty address. The version
// is not taken into account.
func (e *Endpoint) IsValid() bool {
        return len(e.Address) > 0
}

// NewSandbox creates an internally used sandbox type. This function reminds
// the caller that a sandbox must have a status. The sandbox's stop channel is
// already stopped when status.State is StateNotReady.
func NewSandbox(metadata Metadata, status Status) Sandbox {
        stopCh := store.NewStopCh()
        if status.State == StateNotReady {
                stopCh.Stop()
        }
        return Sandbox{
                Metadata: metadata,
                Status:   StoreStatus(status),
                StopCh:   stopCh,
        }
}

// Store stores all sandboxes.
//
// lock guards the other fields. sandboxes maps a full sandbox id to its
// Sandbox, idIndex resolves ids (as passed to Get, UpdateContainerStats and
// Delete) to full ids, and labels tracks the SELinux process labels reserved
// by stored sandboxes.
type Store struct {
        lock      sync.RWMutex
        sandboxes map[string]Sandbox
        idIndex   *truncindex.TruncIndex
        labels    *label.Store
}

// NewStore returns an empty sandbox store that reserves and releases SELinux
// process labels through labels.
func NewStore(labels *label.Store) *Store {
        s := new(Store)
        s.sandboxes = make(map[string]Sandbox)
        s.idIndex = truncindex.NewTruncIndex([]string{})
        s.labels = labels
        return s
}

// Add a sandbox into the store. Returns errdefs.ErrAlreadyExists if the sandbox is
// already stored.
//
// Adding a sandbox reserves its SELinux process label and registers its id in
// the id index. If either step fails the error is returned and the store is
// left unchanged.
func (s *Store) Add(sb Sandbox) error {
        s.lock.Lock()
        defer s.lock.Unlock()
        if _, ok := s.sandboxes[sb.ID]; ok {
                return errdefs.ErrAlreadyExists
        }
        if err := s.labels.Reserve(sb.ProcessLabel); err != nil {
                return err
        }
        if err := s.idIndex.Add(sb.ID); err != nil {
                // Roll back the reservation taken above. The sandbox is not
                // stored, so Delete would never release it and the level would
                // stay reserved forever.
                s.labels.Release(sb.ProcessLabel)
                return err
        }
        s.sandboxes[sb.ID] = sb
        return nil
}

// Get returns the sandbox with the specified id, which is resolved through
// the id index first.
// Returns errdefs.ErrNotFound if the sandbox doesn't exist.
func (s *Store) Get(id string) (Sandbox, error) {
        s.lock.RLock()
        defer s.lock.RUnlock()

        fullID, err := s.idIndex.Get(id)
        switch {
        case err == truncindex.ErrNotExist:
                return Sandbox{}, errdefs.ErrNotFound
        case err != nil:
                return Sandbox{}, err
        }

        sb, found := s.sandboxes[fullID]
        if !found {
                return Sandbox{}, errdefs.ErrNotFound
        }
        return sb, nil
}

// List returns all stored sandboxes, in no particular order. The result is
// nil when the store is empty.
func (s *Store) List() []Sandbox {
        s.lock.RLock()
        defer s.lock.RUnlock()
        var all []Sandbox
        for id := range s.sandboxes {
                all = append(all, s.sandboxes[id])
        }
        return all
}

// UpdateContainerStats replaces the stats of the sandbox specified by id with
// newContainerStats. Returns errdefs.ErrNotFound if the sandbox does not
// exist in the store.
func (s *Store) UpdateContainerStats(id string, newContainerStats *stats.ContainerStats) error {
        s.lock.Lock()
        defer s.lock.Unlock()

        fullID, err := s.idIndex.Get(id)
        switch {
        case err == truncindex.ErrNotExist:
                return errdefs.ErrNotFound
        case err != nil:
                return err
        }

        sb, found := s.sandboxes[fullID]
        if !found {
                return errdefs.ErrNotFound
        }
        // Sandboxes are stored by value, so write the modified copy back.
        sb.Stats = newContainerStats
        s.sandboxes[fullID] = sb
        return nil
}

// Delete removes the sandbox with the specified id and releases its SELinux
// process label. Ids that cannot be resolved are ignored.
func (s *Store) Delete(id string) {
        s.lock.Lock()
        defer s.lock.Unlock()

        fullID, err := s.idIndex.Get(id)
        if err != nil {
                // Note: idIndex.Delete and the map delete below don't handle
                // truncated ids, so return if the id can't be resolved.
                return
        }
        sb := s.sandboxes[fullID]
        s.labels.Release(sb.ProcessLabel)
        s.idIndex.Delete(fullID)
        delete(s.sandboxes, fullID)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sandbox

import (
        "strconv"
        "sync"
        "time"

        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// The sandbox state machine in the CRI plugin:
//                    +              +
//                    |              |
//                    | Create(Run)  | Load
//                    |              |
//                    |              |
//                    |              |    Start
//                    |              |(failed and not cleaned)
//      Start         |--------------|--------------+
//(failed but cleaned)|              |              |
// +------------------+              |-----------+  |
// |                  | Start(Run)   |           |  |
// |                  |              |           |  |
// | PortForward +----v----+         |           |  |
// |      +------+         |         |           |  |
// |      |      |  READY  <---------+           |  |
// |      +------>         |         |           |  |
// |             +----+----+         |           |  |
// |                  |              |           |  |
// |                  | Stop/Exit    |           |  |
// |                  |              |           |  |
// |             +----v----+         |           |  |
// |             |         <---------+      +----v--v-+
// |             | NOTREADY|                |         |
// |             |         <----------------+ UNKNOWN |
// |             +----+----+       Stop     |         |
// |                  |                     +---------+
// |                  | Remove
// |                  v
// +-------------> DELETED

// State is the sandbox state we use in containerd/cri.
// It includes unknown, which is an internal state not defined in CRI.
// The state mapping from internal states to CRI states:
// * ready -> ready
// * not ready -> not ready
// * unknown -> not ready
type State uint32

const (
        // StateReady is the ready state, it means the sandbox container
        // is running.
        StateReady State = iota
        // StateNotReady is the notready state, it ONLY means the sandbox
        // container is not running.
        // StopPodSandbox should still be called for a NOTREADY sandbox to
        // clean up resources other than the sandbox container, e.g. network namespace.
        // This is an assumption made in CRI.
        StateNotReady
        // StateUnknown is the unknown state. A sandbox only goes
        // into unknown state when its status fails to be loaded.
        StateUnknown
)

// String returns the string representation of the state. Ready and not-ready
// use the names of the corresponding CRI PodSandboxState values; any value
// outside the defined states yields a descriptive "invalid" string.
func (s State) String() string {
        if s == StateReady {
                return runtime.PodSandboxState_SANDBOX_READY.String()
        }
        if s == StateNotReady {
                return runtime.PodSandboxState_SANDBOX_NOTREADY.String()
        }
        if s == StateUnknown {
                // PodSandboxState doesn't have an unknown state, but State does,
                // so return a string using the same convention.
                return "SANDBOX_UNKNOWN"
        }
        return "invalid sandbox state value: " + strconv.Itoa(int(s))
}

// Status is the status of a sandbox.
type Status struct {
        // Pid is the init process id of the sandbox container.
        Pid uint32
        // CreatedAt is the created timestamp.
        CreatedAt time.Time
        // ExitedAt is the stop timestamp.
        ExitedAt time.Time
        // ExitStatus is the stop sandbox status.
        ExitStatus uint32
        // State is the state of the sandbox.
        State State
}

// UpdateFunc is the function used to update the sandbox status. It receives
// the current status and returns the new one. If it returns an error, the
// update will be rolled back.
type UpdateFunc func(Status) (Status, error)

// StatusStorage manages the sandbox status.
// The status storage for sandbox is different from container status storage,
// because we don't checkpoint sandbox status. If we need to checkpoint in the
// future, we should combine this with container status storage.
type StatusStorage interface {
        // Get a sandbox status.
        Get() Status
        // Update the sandbox status. Note that the update MUST be applied
        // in one transaction.
        Update(UpdateFunc) error
}

// StoreStatus creates an in-memory storage holding the passed in sandbox
// status.
// The status MUST be created in one transaction.
func StoreStatus(status Status) StatusStorage {
        storage := new(statusStorage)
        storage.status = status
        return storage
}

// statusStorage is the StatusStorage returned by StoreStatus. The embedded
// RWMutex guards status.
type statusStorage struct {
        sync.RWMutex
        status Status
}

// Get returns a copy of the current sandbox status.
func (s *statusStorage) Get() Status {
        s.RLock()
        current := s.status
        s.RUnlock()
        return current
}

// Update applies u to the current sandbox status while holding the write
// lock. The stored status is replaced only if u succeeds; otherwise it is
// left untouched and u's error is returned.
func (s *statusStorage) Update(u UpdateFunc) error {
        s.Lock()
        defer s.Unlock()
        updated, err := u(s.status)
        if err == nil {
                s.status = updated
        }
        return err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package snapshot

import (
        "sync"

        snapshot "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/errdefs"
)

// Key identifies a snapshot in the store by its snapshot key together with
// the snapshotter that manages it.
type Key struct {
        // Key is the key of the snapshot
        Key string
        // Snapshotter is the name of the snapshotter managing the snapshot
        Snapshotter string
}

// Snapshot contains the information about the snapshot.
type Snapshot struct {
        // Key is the key of the snapshot.
        Key Key
        // Kind is the kind of the snapshot (active, committed, view).
        Kind snapshot.Kind
        // Size is the size of the snapshot in bytes.
        Size uint64
        // Inodes is the number of inodes used by the snapshot.
        Inodes uint64
        // Timestamp is the latest update time (in nanoseconds) of the snapshot
        // information.
        Timestamp int64
}

// Store stores all snapshots. lock guards snapshots, which maps each
// snapshot's Key to its Snapshot.
type Store struct {
        lock      sync.RWMutex
        snapshots map[Key]Snapshot
}

// NewStore returns an empty snapshot store.
func NewStore() *Store {
        s := new(Store)
        s.snapshots = make(map[Key]Snapshot)
        return s
}

// Add puts a snapshot into the store, replacing any snapshot already stored
// under the same key.
func (s *Store) Add(snapshot Snapshot) {
        key := snapshot.Key
        s.lock.Lock()
        defer s.lock.Unlock()
        s.snapshots[key] = snapshot
}

// Get returns the snapshot stored under key. Returns errdefs.ErrNotFound if
// the snapshot doesn't exist.
func (s *Store) Get(key Key) (Snapshot, error) {
        s.lock.RLock()
        defer s.lock.RUnlock()
        sn, found := s.snapshots[key]
        if !found {
                return Snapshot{}, errdefs.ErrNotFound
        }
        return sn, nil
}

// List returns all stored snapshots, in no particular order. The result is
// nil when the store is empty.
func (s *Store) List() []Snapshot {
        s.lock.RLock()
        defer s.lock.RUnlock()
        var all []Snapshot
        for key := range s.snapshots {
                all = append(all, s.snapshots[key])
        }
        return all
}

// Delete removes the snapshot stored under key. Deleting a key that is not
// present is a no-op.
func (s *Store) Delete(key Key) {
        s.lock.Lock()
        delete(s.snapshots, key)
        s.lock.Unlock()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package store

import "sync"

// StopCh is used to propagate the stop information of a container. The
// underlying channel is closed at most once, no matter how often Stop is
// called.
type StopCh struct {
        ch   chan struct{}
        once sync.Once
}

// NewStopCh creates a stop channel. The channel is open by default.
func NewStopCh() *StopCh {
        s := new(StopCh)
        s.ch = make(chan struct{})
        return s
}

// Stop closes the stop channel of the container. Calls after the first one
// have no effect.
func (s *StopCh) Stop() {
        s.once.Do(func() { close(s.ch) })
}

// Stopped returns the stop channel of the container as a read-only channel.
// It is closed once Stop has been called.
func (s *StopCh) Stopped() <-chan struct{} {
        var stopped <-chan struct{} = s.ch
        return stopped
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package systemd

import (
        "os"
        "sync"
)

var (
        // runningSystemd caches the detection result; it is written once, under
        // detectSystemd.
        runningSystemd bool
        detectSystemd  sync.Once
)

// IsRunningSystemd checks whether the host was booted with systemd as its init
// system. This functions similarly to systemd's `sd_booted(3)`: internally, it
// checks whether /run/systemd/system/ exists and is a directory. The check is
// performed once and the result is cached for later calls.
// https://github.com/coreos/go-systemd/blob/d843340ab4bd3815fda02e648f9b09ae2dc722a7/util/util.go#L68-L78
func IsRunningSystemd() bool {
        detectSystemd.Do(func() {
                // A failed Lstat leaves runningSystemd at its zero value, false.
                if fi, err := os.Lstat("/run/systemd/system"); err == nil {
                        runningSystemd = fi.IsDir()
                }
        })
        return runningSystemd
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package util

import (
        "encoding/json"
        "errors"
        "fmt"
)

// DeepCopy copies src into dst by marshalling src to JSON and unmarshalling
// the result into dst, so only data that survives a JSON round trip is
// copied. dst is handed to json.Unmarshal as-is. An error is returned when
// either argument is nil or when one of the two JSON steps fails.
func DeepCopy(dst interface{}, src interface{}) error {
        switch {
        case dst == nil:
                return errors.New("dst cannot be nil")
        case src == nil:
                return errors.New("src cannot be nil")
        }

        encoded, err := json.Marshal(src)
        if err != nil {
                return fmt.Errorf("unable to marshal src: %w", err)
        }
        if err := json.Unmarshal(encoded, dst); err != nil {
                return fmt.Errorf("unable to unmarshal into dst: %w", err)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package util

import (
        "crypto/rand"
        "encoding/hex"
)

// GenerateID returns a random identifier: 32 bytes read from crypto/rand,
// rendered as a 64-character lowercase hexadecimal string.
func GenerateID() string {
        var raw [32]byte
        // The result of rand.Read is intentionally not inspected here.
        rand.Read(raw[:])
        return hex.EncodeToString(raw[:])
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package util

import reference "github.com/distribution/reference"

// ParseImageReferences sorts a list of arbitrary image references into
// repotags and repodigests, returned in that order. References that fail to
// parse are skipped, as are references that are neither canonical (digested)
// nor tagged. A reference that is both is reported as a digest only.
func ParseImageReferences(refs []string) ([]string, []string) {
        var (
                repoTags    []string
                repoDigests []string
        )
        for _, raw := range refs {
                named, err := reference.ParseAnyReference(raw)
                if err != nil {
                        continue
                }
                // Canonical is checked first, so it wins over Tagged.
                switch named.(type) {
                case reference.Canonical:
                        repoDigests = append(repoDigests, named.String())
                case reference.Tagged:
                        repoTags = append(repoTags, named.String())
                }
        }
        return repoTags, repoDigests
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package util

import (
        "strings"

        "k8s.io/apimachinery/pkg/util/sets"
)

// InStringSlice reports whether str occurs in ss. Elements are compared with
// strings.EqualFold, i.e. case-insensitively.
func InStringSlice(ss []string, str string) bool {
        found := false
        // Stop scanning as soon as a match has been seen.
        for i := 0; i < len(ss) && !found; i++ {
                found = strings.EqualFold(ss[i], str)
        }
        return found
}

// SubtractStringSlice returns the elements of ss that are not equal to str,
// in their original order. Elements are compared with strings.EqualFold,
// i.e. case-insensitively. The result is nil when nothing is left.
func SubtractStringSlice(ss []string, str string) []string {
        var kept []string
        for i := range ss {
                if candidate := ss[i]; !strings.EqualFold(candidate, str) {
                        kept = append(kept, candidate)
                }
        }
        return kept
}

// MergeStringSlices returns the union of a and b with duplicates removed.
// The result comes from the set's UnsortedList, so its order is not sorted.
func MergeStringSlices(a []string, b []string) []string {
        merged := sets.NewString()
        merged.Insert(a...)
        merged.Insert(b...)
        return merged.UnsortedList()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package util

import (
        "context"
        "time"

        "github.com/containerd/containerd/v2/pkg/namespaces"

        "github.com/containerd/containerd/v2/internal/cri/constants"
)

// deferCleanupTimeout is the default timeout for containerd cleanup
// operations that run in a defer. DeferContext applies it to the contexts it
// creates.
const deferCleanupTimeout = 1 * time.Minute

// DeferContext returns a context, and its cancel function, for containerd
// cleanup operations that run in a defer. The context is derived from
// NamespacedContext and expires after deferCleanupTimeout, so a cleanup
// operation cannot stay pending forever.
func DeferContext() (context.Context, context.CancelFunc) {
        parent := NamespacedContext()
        return context.WithTimeout(parent, deferCleanupTimeout)
}

// NamespacedContext returns a fresh background context with the kubernetes
// namespace set.
func NamespacedContext() context.Context {
        root := context.Background()
        return WithNamespace(root)
}

// WithNamespace adds kubernetes namespace to the context.
func WithNamespace(ctx context.Context) context.Context {
        return namespaces.WithNamespace(ctx, constants.K8sContainerdNamespace)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package eventq

import (
        "io"
        "time"
)

// EventQueue is the handle returned by New. It holds the channels used to
// talk to the queue's background goroutine.
type EventQueue[T any] struct {
        // events is the send side of the channel on which Send hands events
        // to the queue goroutine.
        events chan<- T
        // subscriberC is the send side of the channel on which Subscribe
        // registers new subscriptions.
        subscriberC chan<- eventSubscription[T]
        // shutdownC is sent on and then closed by Shutdown; Send also selects
        // on it so that it does not block once the queue is shut down.
        shutdownC chan struct{}
}

// eventSubscription is one subscriber's endpoint: the channel events are
// delivered on, plus a channel that is closed once the subscriber is done.
type eventSubscription[T any] struct {
        // c is the send side of the subscriber's event channel.
        c chan<- T
        // closeC is closed by Close to signal that no more events are wanted.
        closeC chan struct{}
}

// publish tries to deliver event to the subscriber. It blocks until either
// the event has been sent on c (returns true) or the subscription has been
// closed (returns false).
func (sub eventSubscription[T]) publish(event T) bool {
        delivered := false
        select {
        case sub.c <- event:
                delivered = true
        case <-sub.closeC:
        }
        return delivered
}

// Close marks the subscription as closed by closing closeC, unless it is
// already closed. It always returns nil.
func (sub eventSubscription[T]) Close() error {
        select {
        case <-sub.closeC:
                // Already closed; nothing left to do.
                return nil
        default:
        }
        close(sub.closeC)
        return nil
}

// New provides a queue for sending messages to one or more
// subscribers. Messages are held for the given discardAfter duration
// if there are no subscribers.
//
// New starts a goroutine that owns all queue state and runs until a
// receive on the shutdown channel succeeds (see Shutdown). discardFn is
// called from that goroutine for every held event that is dropped, either
// because its hold time expired or because the queue is shutting down; it is
// called unconditionally, so it must not be nil if events can be discarded.
func New[T any](discardAfter time.Duration, discardFn func(T)) EventQueue[T] {
        events := make(chan T)
        subscriberC := make(chan eventSubscription[T])
        shutdownC := make(chan struct{})

        go func() {
                // queuedEvent is an event held while there are no subscribers,
                // together with the time after which it may be discarded.
                type queuedEvent struct {
                        event     T
                        discardAt time.Time
                }

                var discardQueue []queuedEvent
                // discardTime is nil while no discard timer is armed; receiving
                // from a nil channel never proceeds, which disables that case.
                var discardTime <-chan time.Time
                var subscribers []eventSubscription[T]
                for {
                        select {
                        case <-shutdownC:
                                // Drop everything still held, then close every
                                // subscriber channel so receivers see the end.
                                for _, event := range discardQueue {
                                        discardFn(event.event)
                                }
                                for _, sub := range subscribers {
                                        close(sub.c)
                                }
                                return
                        case event := <-events:
                                if len(subscribers) > 0 {
                                        // Filter in place: keep only subscribers that
                                        // accepted the event; closed ones are dropped.
                                        active := subscribers[:0]
                                        for _, sub := range subscribers {
                                                if sub.publish(event) {
                                                        active = append(active, sub)
                                                }
                                        }
                                        subscribers = active
                                }
                                // Hold the event when no subscriber is left (this
                                // includes the case where all of them just closed).
                                if len(subscribers) == 0 {
                                        discardQueue = append(discardQueue, queuedEvent{
                                                event:     event,
                                                discardAt: time.Now().Add(discardAfter),
                                        })
                                        if discardTime == nil {
                                                discardTime = time.After(time.Until(discardQueue[0].discardAt).Abs())
                                        }
                                }
                        case s := <-subscriberC:
                                // Replay held events to the new subscriber first.
                                var closed bool
                                for i, event := range discardQueue {
                                        if !s.publish(event.event) {
                                                // The subscriber closed mid-replay: keep the
                                                // events from index i on; earlier ones were
                                                // already sent on its channel.
                                                discardQueue = discardQueue[i:]
                                                closed = true
                                                break
                                        }
                                }
                                if !closed {
                                        discardQueue = nil
                                        discardTime = nil
                                        subscribers = append(subscribers, s)
                                }
                        case t := <-discardTime:
                                // Discard held events whose deadline is before t and
                                // keep the rest, starting at the first unexpired one.
                                toDiscard := discardQueue
                                discardQueue = nil
                                for i, event := range toDiscard {
                                        if t.After(event.discardAt) {
                                                discardFn(event.event)
                                        } else {
                                                discardQueue = toDiscard[i:]
                                                break
                                        }
                                }
                                if len(discardQueue) == 0 {
                                        discardTime = nil
                                } else {
                                        // Wait until next item in discard queue plus a small buffer to collect a burst of events
                                        discardTime = time.After(time.Until(discardQueue[0].discardAt).Abs() + 10*time.Millisecond)
                                }
                        }

                }
        }()

        return EventQueue[T]{
                events:      events,
                subscriberC: subscriberC,
                shutdownC:   shutdownC,
        }
}

// Shutdown stops the queue. It blocks until a value has been received from
// shutdownC and then closes the channel, which also unblocks any pending
// Send. It must be called at most once: a second call would send on and
// close an already-closed channel, which panics.
func (eq *EventQueue[T]) Shutdown() {
        defer close(eq.shutdownC)
        eq.shutdownC <- struct{}{}
}

// Send hands event to the queue. It blocks until the event has been accepted
// on the events channel or a receive from shutdownC succeeds; in the latter
// case the event is dropped silently.
func (eq *EventQueue[T]) Send(event T) {
        select {
        case eq.events <- event:
                // Accepted by the queue.
        case <-eq.shutdownC:
                // Shutting down: the event is dropped.
        }
}

// Subscribe registers a new subscriber and returns the channel on which
// events are delivered (buffered with room for 100 events) together with an
// io.Closer that ends the subscription.
//
// If the queue has been shut down, nothing is left to receive the
// registration. Instead of blocking forever, Subscribe then returns an
// already-closed channel, which is what existing subscribers observe on
// shutdown as well.
func (eq *EventQueue[T]) Subscribe() (<-chan T, io.Closer) {
        c := make(chan T, 100)
        subscription := eventSubscription[T]{
                c:      c,
                closeC: make(chan struct{}),
        }

        select {
        case eq.subscriberC <- subscription:
        case <-eq.shutdownC:
                // The subscription was never handed over, so this function
                // still owns c and closing it here is safe.
                close(c)
        }

        return c, subscription
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package kmutex provides synchronization primitives to lock/unlock resource by unique key.
package kmutex

import (
        "context"
        "fmt"
        "sync"

        "golang.org/x/sync/semaphore"
)

// KeyedLocker is the interface for acquiring locks based on string.
type KeyedLocker interface {
        // Lock acquires the lock for key. A non-nil error means the lock was
        // not acquired.
        Lock(ctx context.Context, key string) error
        // Unlock releases the lock for key.
        Unlock(key string)
}

// New returns a KeyedLocker backed by a keyMutex.
func New() KeyedLocker {
        var locker KeyedLocker = newKeyMutex()
        return locker
}

// newKeyMutex returns a keyMutex with an empty, ready-to-use lock table.
func newKeyMutex() *keyMutex {
        km := new(keyMutex)
        km.locks = make(map[string]*klock)
        return km
}

// keyMutex is the KeyedLocker implementation returned by New. It keeps one
// klock per key that is currently locked or being waited for.
type keyMutex struct {
        // mu guards locks and the ref counters of the klock entries.
        mu sync.Mutex

        // locks maps a key to its lock state. Entries are created on demand in
        // Lock and deleted once their reference count drops to zero.
        locks map[string]*klock
}

// klock is the per-key state stored in keyMutex.locks.
type klock struct {
        // Weighted is the semaphore that provides the lock itself; Lock
        // creates it with a total weight of 1 and acquires a weight of 1.
        *semaphore.Weighted
        // ref counts the Lock calls that hold or are waiting for this key. It
        // is incremented before acquiring and decremented in Unlock or when
        // acquiring fails.
        ref int
}

// Lock acquires the lock for key, waiting until it is available. If
// acquiring the underlying semaphore fails, that error is returned and the
// lock is not held.
func (km *keyMutex) Lock(ctx context.Context, key string) error {
        // Register interest in the key under the table mutex, creating the
        // entry on first use.
        km.mu.Lock()
        entry, found := km.locks[key]
        if !found {
                entry = &klock{Weighted: semaphore.NewWeighted(1)}
                km.locks[key] = entry
        }
        entry.ref++
        km.mu.Unlock()

        // Wait outside the table mutex so other keys are not blocked.
        err := entry.Acquire(ctx, 1)
        if err == nil {
                return nil
        }

        // Acquisition failed: undo the reference taken above.
        km.mu.Lock()
        defer km.mu.Unlock()

        entry.ref--
        switch {
        case entry.ref < 0:
                panic(fmt.Errorf("kmutex: release of unlocked key %v", key))
        case entry.ref == 0:
                delete(km.locks, key)
        }
        return err
}

// Unlock releases the lock for key and drops its table entry once nothing
// references it any more. It panics if key has no entry in the table.
func (km *keyMutex) Unlock(key string) {
        km.mu.Lock()
        defer km.mu.Unlock()

        entry, found := km.locks[key]
        if !found {
                panic(fmt.Errorf("kmutex: unlock of unlocked key %v", key))
        }
        entry.Release(1)

        entry.ref--
        switch {
        case entry.ref < 0:
                panic(fmt.Errorf("kmutex: released of unlocked key %v", key))
        case entry.ref == 0:
                delete(km.locks, key)
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package kmutex

import "context"

// NewNoop returns a KeyedLocker whose Lock and Unlock do nothing.
func NewNoop() KeyedLocker {
        return new(noopMutex)
}

// noopMutex is a KeyedLocker implementation that performs no locking.
type noopMutex struct{}

// Lock returns nil immediately without acquiring anything.
func (*noopMutex) Lock(context.Context, string) error {
        return nil
}

// Unlock does nothing.
func (*noopMutex) Unlock(string) {}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package nri

import (
        "github.com/containerd/containerd/v2/internal/tomlext"
        nri "github.com/containerd/nri/pkg/adaptation"
)

// Config holds the NRI configuration. Each field carries both a TOML and a
// JSON tag.
type Config struct {
        // Disable this NRI plugin and containerd NRI functionality altogether.
        Disable bool `toml:"disable" json:"disable"`
        // SocketPath is the path to the NRI socket to create for NRI plugins to connect to.
        // An empty value is not passed on as an option (see toOptions).
        SocketPath string `toml:"socket_path" json:"socketPath"`
        // PluginPath is the path to search for NRI plugins to launch on startup.
        // An empty value is not passed on as an option (see toOptions).
        PluginPath string `toml:"plugin_path" json:"pluginPath"`
        // PluginConfigPath is the path to search for plugin-specific configuration.
        // An empty value is not passed on as an option (see toOptions).
        PluginConfigPath string `toml:"plugin_config_path" json:"pluginConfigPath"`
        // PluginRegistrationTimeout is the timeout for plugin registration.
        // A zero value is not applied by ConfigureTimeouts.
        PluginRegistrationTimeout tomlext.Duration `toml:"plugin_registration_timeout" json:"pluginRegistrationTimeout"`
        // PluginRequestTimeout is the timeout for a plugin to handle a request.
        // A zero value is not applied by ConfigureTimeouts.
        PluginRequestTimeout tomlext.Duration `toml:"plugin_request_timeout" json:"pluginRequestTimeout"`
        // DisableConnections disables connections from externally launched plugins.
        DisableConnections bool `toml:"disable_connections" json:"disableConnections"`
}

// DefaultConfig returns a configuration with NRI enabled and with the
// socket path, plugin paths and plugin timeouts set to the defaults of the
// NRI adaptation package.
func DefaultConfig() *Config {
        cfg := new(Config)

        cfg.Disable = false
        cfg.SocketPath = nri.DefaultSocketPath
        cfg.PluginPath = nri.DefaultPluginPath
        cfg.PluginConfigPath = nri.DefaultPluginConfigPath

        cfg.PluginRegistrationTimeout = tomlext.FromStdTime(nri.DefaultPluginRegistrationTimeout)
        cfg.PluginRequestTimeout = tomlext.FromStdTime(nri.DefaultPluginRequestTimeout)

        return cfg
}

// toOptions translates this configuration into NRI options. Empty paths
// produce no option; the returned slice is never nil.
func (c *Config) toOptions() []nri.Option {
        options := make([]nri.Option, 0)
        if p := c.SocketPath; p != "" {
                options = append(options, nri.WithSocketPath(p))
        }
        if p := c.PluginPath; p != "" {
                options = append(options, nri.WithPluginPath(p))
        }
        if p := c.PluginConfigPath; p != "" {
                options = append(options, nri.WithPluginConfigPath(p))
        }
        if c.DisableConnections {
                options = append(options, nri.WithDisabledExternalConnections())
        }
        return options
}

// ConfigureTimeouts applies the configured plugin registration and request
// timeouts to NRI. A timeout of zero is not applied.
func (c *Config) ConfigureTimeouts() {
        if registration := c.PluginRegistrationTimeout; registration != 0 {
                nri.SetPluginRegistrationTimeout(tomlext.ToStdTime(registration))
        }
        if request := c.PluginRequestTimeout; request != 0 {
                nri.SetPluginRequestTimeout(tomlext.ToStdTime(request))
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package nri

import (
        nri "github.com/containerd/nri/pkg/adaptation"
)

// Container interface for interacting with NRI. Most getters supply the
// value of the correspondingly named field of the nri.Container built by
// commonContainerToNRI.
type Container interface {
        // GetDomain returns the domain of the container.
        // NOTE(review): presumably the containerd namespace, as in Domain.GetName — confirm.
        GetDomain() string

        GetPodSandboxID() string
        GetID() string
        GetName() string
        GetState() nri.ContainerState
        GetLabels() map[string]string
        GetAnnotations() map[string]string
        GetArgs() []string
        GetEnv() []string
        GetMounts() []*nri.Mount
        GetHooks() *nri.Hooks
        // GetLinuxContainer returns the Linux-specific view of the container.
        GetLinuxContainer() LinuxContainer

        GetPid() uint32
}

// LinuxContainer provides the Linux-specific container data for NRI. It is
// obtained from Container.GetLinuxContainer.
type LinuxContainer interface {
        GetLinuxNamespaces() []*nri.LinuxNamespace
        GetLinuxDevices() []*nri.LinuxDevice
        GetLinuxResources() *nri.LinuxResources
        // GetOOMScoreAdj returns a pointer.
        // NOTE(review): nil presumably means "not set" — confirm.
        GetOOMScoreAdj() *int
        GetCgroupsPath() string
}

// commonContainerToNRI builds an nri.Container from the platform-independent
// getters of ctr. Platform-specific fields are left unset.
func commonContainerToNRI(ctr Container) *nri.Container {
        out := new(nri.Container)

        out.Id = ctr.GetID()
        out.PodSandboxId = ctr.GetPodSandboxID()
        out.Name = ctr.GetName()
        out.State = ctr.GetState()
        out.Labels = ctr.GetLabels()
        out.Annotations = ctr.GetAnnotations()
        out.Args = ctr.GetArgs()
        out.Env = ctr.GetEnv()
        out.Mounts = ctr.GetMounts()
        out.Hooks = ctr.GetHooks()
        out.Pid = ctr.GetPid()

        return out
}

// containersToNRI converts every container in ctrList with containerToNRI,
// keeping the order. The result is an empty, non-nil slice when ctrList is
// empty.
func containersToNRI(ctrList []Container) []*nri.Container {
        converted := []*nri.Container{}
        for i := range ctrList {
                converted = append(converted, containerToNRI(ctrList[i]))
        }
        return converted
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package nri

import (
        nri "github.com/containerd/nri/pkg/adaptation"
)

// containerToNRI converts ctr to an nri.Container: the common fields come
// from commonContainerToNRI, the Linux section from ctr.GetLinuxContainer().
func containerToNRI(ctr Container) *nri.Container {
        converted := commonContainerToNRI(ctr)
        linux := ctr.GetLinuxContainer()

        section := new(nri.LinuxContainer)
        section.Namespaces = linux.GetLinuxNamespaces()
        section.Devices = linux.GetLinuxDevices()
        section.Resources = linux.GetLinuxResources()
        section.OomScoreAdj = nri.Int(linux.GetOOMScoreAdj())
        section.CgroupsPath = linux.GetCgroupsPath()

        converted.Linux = section
        return converted
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package nri

import (
        "context"
        "fmt"
        "sync"

        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        nri "github.com/containerd/nri/pkg/adaptation"
)

// Domain implements the functions the generic NRI interface needs to
// deal with pods and containers from a particular containerd namespace.
// Domains are registered with RegisterDomain.
type Domain interface {
        // GetName returns the containerd namespace for this domain.
        GetName() string

        // ListPodSandboxes lists all pods in this namespace.
        ListPodSandboxes() []PodSandbox

        // ListContainers lists all containers in this namespace.
        ListContainers() []Container

        // GetPodSandbox returns the pod for the given ID.
        // NOTE(review): the bool presumably reports whether the pod was found,
        // as for GetContainer — confirm.
        GetPodSandbox(string) (PodSandbox, bool)

        // GetContainer returns the container for the given ID. The bool
        // reports whether the container was found in this namespace.
        GetContainer(string) (Container, bool)

        // UpdateContainer applies an NRI container update request in the namespace.
        UpdateContainer(context.Context, *nri.ContainerUpdate) error

        // EvictContainer evicts the requested container in the namespace.
        EvictContainer(context.Context, *nri.ContainerEviction) error
}

// RegisterDomain registers an NRI domain for a containerd namespace. A
// failed registration is logged with Fatalf; a successful one is logged at
// info level.
func RegisterDomain(d Domain) {
        if err := domains.add(d); err != nil {
                log.L.WithError(err).Fatalf("Failed to register namespace %q with NRI", d.GetName())
        }

        log.L.Infof("Registered namespace %q with NRI", d.GetName())
}

// domainTable is the set of registered domains, keyed by namespace name.
type domainTable struct {
        // Mutex is held by add, the list helpers and getContainer while they
        // access domains.
        sync.Mutex
        // domains maps Domain.GetName() to the registered domain.
        domains map[string]Domain
}

// add registers d under its namespace name. It returns
// errdefs.ErrAlreadyExists when a domain with that name is already
// registered.
func (t *domainTable) add(d Domain) error {
        t.Lock()
        defer t.Unlock()

        name := d.GetName()
        _, exists := t.domains[name]
        if exists {
                return errdefs.ErrAlreadyExists
        }

        t.domains[name] = d
        return nil
}

// listPodSandboxes returns the pods of all registered domains. Domains are
// visited in map iteration order, so the order across domains is not fixed.
func (t *domainTable) listPodSandboxes() []PodSandbox {
        t.Lock()
        defer t.Unlock()

        var all []PodSandbox
        for _, domain := range t.domains {
                sandboxes := domain.ListPodSandboxes()
                all = append(all, sandboxes...)
        }
        return all
}

// listContainers returns the containers of all registered domains. Domains
// are visited in map iteration order, so the order across domains is not
// fixed.
func (t *domainTable) listContainers() []Container {
        t.Lock()
        defer t.Unlock()

        var all []Container
        for _, domain := range t.domains {
                containers := domain.ListContainers()
                all = append(all, containers...)
        }
        return all
}

// getContainer looks id up in every registered domain and returns the first
// container found together with the domain that owns it. Both results are
// nil when no domain knows the ID.
func (t *domainTable) getContainer(id string) (Container, Domain) {
        t.Lock()
        defer t.Unlock()

        // TODO(klihub): Are ID conflicts across namespaces possible ? Probably...

        for _, domain := range t.domains {
                ctr, found := domain.GetContainer(id)
                if !found {
                        continue
                }
                return ctr, domain
        }
        return nil, nil
}

// updateContainers applies each update through the domain that owns the
// target container, using a context that carries that domain's namespace.
// Updates whose container is not found in any domain are skipped. Failures
// are logged; failed updates that do not have IgnoreFailure set are returned
// together with a non-nil error.
func (t *domainTable) updateContainers(ctx context.Context, updates []*nri.ContainerUpdate) ([]*nri.ContainerUpdate, error) {
        var failed []*nri.ContainerUpdate

        for _, update := range updates {
                _, owner := t.getContainer(update.ContainerId)
                if owner == nil {
                        continue
                }

                ns := owner.GetName()
                if err := owner.UpdateContainer(namespaces.WithNamespace(ctx, ns), update); err != nil {
                        log.G(ctx).WithError(err).Errorf("NRI update of %s container %s failed",
                                ns, update.ContainerId)
                        if !update.IgnoreFailure {
                                failed = append(failed, update)
                        }
                        continue
                }

                log.G(ctx).Tracef("NRI update of %s container %s successful", ns, update.ContainerId)
        }

        if len(failed) == 0 {
                return nil, nil
        }

        return failed, fmt.Errorf("NRI update of some containers failed")
}

// evictContainers performs each eviction through the domain that owns the
// target container, using a context that carries that domain's namespace.
// Evictions whose container is not found in any domain are skipped. Failures
// are logged and the failed requests are returned together with a non-nil
// error.
func (t *domainTable) evictContainers(ctx context.Context, evict []*nri.ContainerEviction) ([]*nri.ContainerEviction, error) {
        var failed []*nri.ContainerEviction

        for _, eviction := range evict {
                _, owner := t.getContainer(eviction.ContainerId)
                if owner == nil {
                        continue
                }

                ns := owner.GetName()
                if err := owner.EvictContainer(namespaces.WithNamespace(ctx, ns), eviction); err != nil {
                        log.G(ctx).WithError(err).Errorf("NRI eviction of %s container %s failed",
                                ns, eviction.ContainerId)
                        failed = append(failed, eviction)
                        continue
                }

                log.G(ctx).Tracef("NRI eviction of %s container %s successful", ns, eviction.ContainerId)
        }

        if len(failed) == 0 {
                return nil, nil
        }

        return failed, fmt.Errorf("NRI eviction of some containers failed")
}

// domains is the package-level table of registered domains; RegisterDomain
// adds to it.
var domains = &domainTable{
        domains: make(map[string]Domain),
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package nri

import (
        "context"
        "fmt"
        "path"
        "sync"

        "github.com/containerd/log"

        "github.com/containerd/containerd/v2/version"
        nri "github.com/containerd/nri/pkg/adaptation"
)

// API implements a common API for interfacing NRI from containerd. It is
// agnostic to any internal containerd implementation details of pods and
// containers. It needs corresponding Domain interfaces for each containerd
// namespace it needs to handle. These domains take care of the namespace-
// specific details of providing pod and container metadata to NRI and of
// applying NRI-requested adjustments to the state of containers.
type API interface {
        // IsEnabled returns true if the NRI interface is enabled and initialized.
        IsEnabled() bool

        // Start starts the NRI interface, allowing external NRI plugins to
        // connect, register, and hook themselves into the lifecycle events
        // of pods and containers.
        Start() error

        // Stop stops the NRI interface.
        Stop()

        // RunPodSandbox relays pod creation events to NRI.
        RunPodSandbox(context.Context, PodSandbox) error

        // StopPodSandbox relays pod shutdown events to NRI.
        StopPodSandbox(context.Context, PodSandbox) error

        // RemovePodSandbox relays pod removal events to NRI.
        RemovePodSandbox(context.Context, PodSandbox) error

        // CreateContainer relays container creation requests to NRI.
        CreateContainer(context.Context, PodSandbox, Container) (*nri.ContainerAdjustment, error)

        // PostCreateContainer relays successful container creation events to NRI.
        PostCreateContainer(context.Context, PodSandbox, Container) error

        // StartContainer relays container start request notifications to NRI.
        StartContainer(context.Context, PodSandbox, Container) error

        // PostStartContainer relays successful container startup events to NRI.
        PostStartContainer(context.Context, PodSandbox, Container) error

        // UpdateContainer relays container update requests to NRI.
        UpdateContainer(context.Context, PodSandbox, Container, *nri.LinuxResources) (*nri.LinuxResources, error)

        // PostUpdateContainer relays successful container update events to NRI.
        PostUpdateContainer(context.Context, PodSandbox, Container) error

        // StopContainer relays container stop requests to NRI.
        StopContainer(context.Context, PodSandbox, Container) error

        // NotifyContainerExit handles the exit event of a container.
        NotifyContainerExit(context.Context, PodSandbox, Container)

        // RemoveContainer relays container removal events to NRI.
        RemoveContainer(context.Context, PodSandbox, Container) error
}

// State is the lifecycle state recorded per pod or container ID by the
// local NRI adapter.
type State int

const (
        // Created is the first named state. Values start at 1, so the zero
        // value of State does not correspond to any named state.
        Created State = iota + 1
        // Running is recorded once a start (or pod run) request has been relayed.
        Running
        // Stopped is recorded once a stop request has been relayed.
        Stopped
        // Removed is never stored: setState deletes the entry instead, and
        // getState reports Removed for any ID it has no entry for.
        Removed
)

// local is the API implementation that relays requests to an
// nri.Adaptation and tracks the last relayed state of pods and containers.
type local struct {
        // The embedded mutex is held by the request-relaying methods while
        // they talk to NRI and touch state.
        sync.Mutex
        // cfg is the configuration passed to New; cfg.Disable turns the
        // IsEnabled-guarded methods into no-ops.
        cfg *Config
        // nri is left nil by New when NRI is disabled and is reset to nil by Stop.
        nri *nri.Adaptation

        // state maps a pod or container ID to its last recorded State.
        // New only allocates it when NRI is enabled.
        state map[string]State
}

// Compile-time assertion that *local satisfies API.
var _ API = &local{}

// New builds the NRI API implementation for the given configuration.
//
// If cfg.Disable is set the returned instance only carries the configuration
// and reports IsEnabled() == false. Otherwise an NRI adaptation is created,
// named after the base element of version.Package, and wired to this
// instance's plugin synchronization and unsolicited-update callbacks.
func New(cfg *Config) (API, error) {
        l := &local{cfg: cfg}

        if cfg.Disable {
                log.L.Info("NRI interface is disabled by configuration.")
                return l, nil
        }

        runtimeName := path.Base(version.Package)
        runtimeVersion := version.Version
        options := cfg.toOptions()

        // The options are derived before the timeouts are configured.
        cfg.ConfigureTimeouts()

        var err error
        l.nri, err = nri.New(runtimeName, runtimeVersion, l.syncPlugin, l.updateFromPlugin, options...)
        if err != nil {
                return nil, fmt.Errorf("failed to initialize NRI interface: %w", err)
        }

        l.state = make(map[string]State)
        log.L.Info("created NRI interface")

        return l, nil
}

// IsEnabled reports whether NRI is in use: the receiver must be non-nil and
// its configuration must not have Disable set.
func (l *local) IsEnabled() bool {
        if l == nil {
                return false
        }
        return !l.cfg.Disable
}

// Start starts the underlying NRI adaptation. It does nothing and returns
// nil when NRI is disabled; a start failure is returned wrapped.
func (l *local) Start() error {
        if !l.IsEnabled() {
                return nil
        }

        if err := l.nri.Start(); err != nil {
                return fmt.Errorf("failed to start NRI interface: %w", err)
        }
        return nil
}

// Stop stops the underlying NRI adaptation and drops the reference to it.
// It does nothing when NRI is disabled.
func (l *local) Stop() {
        if l.IsEnabled() {
                l.Lock()
                defer l.Unlock()

                l.nri.Stop()
                l.nri = nil
        }
}

// RunPodSandbox relays a pod creation event to NRI and records the pod as
// Running, whether or not the relay succeeded. It is a no-op when NRI is
// disabled.
func (l *local) RunPodSandbox(ctx context.Context, pod PodSandbox) error {
        if !l.IsEnabled() {
                return nil
        }

        l.Lock()
        defer l.Unlock()

        err := l.nri.RunPodSandbox(ctx, &nri.RunPodSandboxRequest{
                Pod: podSandboxToNRI(pod),
        })
        l.setState(pod.GetID(), Running)

        return err
}

// StopPodSandbox relays a pod shutdown event to NRI, but only for pods whose
// recorded state is Created or Running. The pod is recorded as Stopped
// afterwards, whether or not the relay succeeded. It is a no-op when NRI is
// disabled.
func (l *local) StopPodSandbox(ctx context.Context, pod PodSandbox) error {
        if !l.IsEnabled() {
                return nil
        }

        l.Lock()
        defer l.Unlock()

        if needed := l.needsStopping(pod.GetID()); !needed {
                return nil
        }

        err := l.nri.StopPodSandbox(ctx, &nri.StopPodSandboxRequest{
                Pod: podSandboxToNRI(pod),
        })
        l.setState(pod.GetID(), Stopped)

        return err
}

// RemovePodSandbox relays a pod removal event to NRI, but only for pods that
// still have a recorded state. The pod's state entry is dropped afterwards,
// whether or not the relay succeeded. It is a no-op when NRI is disabled.
func (l *local) RemovePodSandbox(ctx context.Context, pod PodSandbox) error {
        if !l.IsEnabled() {
                return nil
        }

        l.Lock()
        defer l.Unlock()

        if needed := l.needsRemoval(pod.GetID()); !needed {
                return nil
        }

        err := l.nri.RemovePodSandbox(ctx, &nri.RemovePodSandboxRequest{
                Pod: podSandboxToNRI(pod),
        })
        l.setState(pod.GetID(), Removed)

        return err
}

// CreateContainer relays a container creation request to NRI and returns the
// container adjustment from the response.
//
// The container is recorded as Created even when the relay fails. Evictions
// and updates of other containers carried in the response are applied on a
// best-effort basis: their failures are logged and otherwise ignored.
// Returns (nil, nil) when NRI is disabled.
func (l *local) CreateContainer(ctx context.Context, pod PodSandbox, ctr Container) (*nri.ContainerAdjustment, error) {
        if !l.IsEnabled() {
                return nil, nil
        }

        l.Lock()
        defer l.Unlock()

        createReq := &nri.CreateContainerRequest{
                Pod:       podSandboxToNRI(pod),
                Container: containerToNRI(ctr),
        }

        rsp, err := l.nri.CreateContainer(ctx, createReq)
        l.setState(createReq.Container.Id, Created)
        if err != nil {
                return nil, err
        }

        // TODO(klihub): we ignore pre-create eviction failures for now
        if _, evictErr := l.evictContainers(ctx, rsp.Evict); evictErr != nil {
                log.G(ctx).WithError(evictErr).Warnf("pre-create eviction failed")
        }

        // TODO(klihub): we ignore pre-create update failures for now
        if _, updateErr := l.applyUpdates(ctx, rsp.Update); updateErr != nil {
                log.G(ctx).WithError(updateErr).Warnf("pre-create update failed")
        }

        return rsp.Adjust, nil
}

// PostCreateContainer relays a successful container creation event to NRI.
// It is a no-op when NRI is disabled.
func (l *local) PostCreateContainer(ctx context.Context, pod PodSandbox, ctr Container) error {
        if !l.IsEnabled() {
                return nil
        }

        l.Lock()
        defer l.Unlock()

        return l.nri.PostCreateContainer(ctx, &nri.PostCreateContainerRequest{
                Pod:       podSandboxToNRI(pod),
                Container: containerToNRI(ctr),
        })
}

// StartContainer relays a container start request to NRI and records the
// container as Running, whether or not the relay succeeded. It is a no-op
// when NRI is disabled.
func (l *local) StartContainer(ctx context.Context, pod PodSandbox, ctr Container) error {
        if !l.IsEnabled() {
                return nil
        }

        l.Lock()
        defer l.Unlock()

        startReq := &nri.StartContainerRequest{
                Pod:       podSandboxToNRI(pod),
                Container: containerToNRI(ctr),
        }

        err := l.nri.StartContainer(ctx, startReq)
        l.setState(startReq.Container.Id, Running)

        return err
}

// PostStartContainer relays a successful container startup event to NRI.
// It is a no-op when NRI is disabled.
func (l *local) PostStartContainer(ctx context.Context, pod PodSandbox, ctr Container) error {
        if !l.IsEnabled() {
                return nil
        }

        l.Lock()
        defer l.Unlock()

        return l.nri.PostStartContainer(ctx, &nri.PostStartContainerRequest{
                Pod:       podSandboxToNRI(pod),
                Container: containerToNRI(ctr),
        })
}

// UpdateContainer relays a container update request, carrying the requested
// resources req, to NRI.
//
// Evictions in the response are applied best-effort. Of the updates in the
// response, all but the last are applied best-effort as updates to
// containers; the resources of the last one are returned to the caller.
// Returns (nil, nil) when NRI is disabled or when the response carries no
// updates.
func (l *local) UpdateContainer(ctx context.Context, pod PodSandbox, ctr Container, req *nri.LinuxResources) (*nri.LinuxResources, error) {
        if !l.IsEnabled() {
                return nil, nil
        }

        l.Lock()
        defer l.Unlock()

        rsp, err := l.nri.UpdateContainer(ctx, &nri.UpdateContainerRequest{
                Pod:            podSandboxToNRI(pod),
                Container:      containerToNRI(ctr),
                LinuxResources: req,
        })
        if err != nil {
                return nil, err
        }

        // TODO(klihub): we ignore pre-update eviction failures for now
        if _, evictErr := l.evictContainers(ctx, rsp.Evict); evictErr != nil {
                log.G(ctx).WithError(evictErr).Warnf("pre-update eviction failed")
        }

        updates := rsp.Update
        if len(updates) == 0 {
                return nil, nil
        }

        last := len(updates) - 1
        if last > 0 {
                // TODO(klihub): we ignore pre-update update failures for now
                if _, updateErr := l.applyUpdates(ctx, updates[:last]); updateErr != nil {
                        log.G(ctx).WithError(updateErr).Warnf("pre-update update failed")
                }
        }

        return updates[last].GetLinux().GetResources(), nil
}

// PostUpdateContainer relays a successful container update event to NRI.
// It is a no-op when NRI is disabled.
func (l *local) PostUpdateContainer(ctx context.Context, pod PodSandbox, ctr Container) error {
        if !l.IsEnabled() {
                return nil
        }

        l.Lock()
        defer l.Unlock()

        return l.nri.PostUpdateContainer(ctx, &nri.PostUpdateContainerRequest{
                Pod:       podSandboxToNRI(pod),
                Container: containerToNRI(ctr),
        })
}

// StopContainer relays a container stop request to NRI while holding the
// lock; the actual work is done by stopContainer. It is a no-op when NRI is
// disabled.
func (l *local) StopContainer(ctx context.Context, pod PodSandbox, ctr Container) error {
        if !l.IsEnabled() {
                return nil
        }

        l.Lock()
        defer l.Unlock()

        err := l.stopContainer(ctx, pod, ctr)
        return err
}

// NotifyContainerExit handles the exit event of a container by relaying a
// stop request to NRI, unless the container no longer needs stopping.
//
// The relay runs in its own goroutine, so this call returns without waiting
// for the lock or for NRI. Like the other API methods it is a no-op when NRI
// is disabled, which also covers a nil receiver. A failed relay cannot be
// returned to the caller, so it is logged.
func (l *local) NotifyContainerExit(ctx context.Context, pod PodSandbox, ctr Container) {
        if !l.IsEnabled() {
                return
        }

        go func() {
                l.Lock()
                defer l.Unlock()

                if err := l.stopContainer(ctx, pod, ctr); err != nil {
                        log.G(ctx).WithError(err).Warnf("NRI stop request for exited container %s failed",
                                ctr.GetID())
                }
        }()
}

// stopContainer relays a container stop request to NRI for containers whose
// recorded state is Created or Running; for any other container it only
// emits a trace message. The container is recorded as Stopped after the
// relay, whether or not it succeeded. Updates carried in the response are
// applied best-effort. Callers hold the lock.
func (l *local) stopContainer(ctx context.Context, pod PodSandbox, ctr Container) error {
        if needed := l.needsStopping(ctr.GetID()); !needed {
                log.G(ctx).Tracef("NRI stopContainer: container %s does not need stopping",
                        ctr.GetID())
                return nil
        }

        stopReq := &nri.StopContainerRequest{
                Pod:       podSandboxToNRI(pod),
                Container: containerToNRI(ctr),
        }

        rsp, err := l.nri.StopContainer(ctx, stopReq)
        l.setState(stopReq.Container.Id, Stopped)
        if err != nil {
                return err
        }

        // TODO(klihub): we ignore post-stop update failures for now
        if _, updateErr := l.applyUpdates(ctx, rsp.Update); updateErr != nil {
                log.G(ctx).WithError(updateErr).Warnf("post-stop update failed")
        }

        return nil
}

// RemoveContainer relays a container removal event to NRI for containers
// that still have a recorded state. A stop request is relayed first if the
// container still needs one; its outcome is ignored. The container's state
// entry is dropped after the removal relay, whether or not it succeeded.
// It is a no-op when NRI is disabled.
func (l *local) RemoveContainer(ctx context.Context, pod PodSandbox, ctr Container) error {
        if !l.IsEnabled() {
                return nil
        }

        l.Lock()
        defer l.Unlock()

        if needed := l.needsRemoval(ctr.GetID()); !needed {
                return nil
        }

        // Best-effort stop before removal; the result is deliberately dropped.
        _ = l.stopContainer(ctx, pod, ctr)

        removeReq := &nri.RemoveContainerRequest{
                Pod:       podSandboxToNRI(pod),
                Container: containerToNRI(ctr),
        }

        err := l.nri.RemoveContainer(ctx, removeReq)
        l.setState(removeReq.Container.Id, Removed)

        return err
}

// syncPlugin is the synchronization callback handed to the NRI adaptation.
// It collects the current pods and containers from the domains, refreshes
// the recorded state of every container from its NRI state, passes both
// lists to syncFn and applies the updates it returns on a best-effort basis.
func (l *local) syncPlugin(ctx context.Context, syncFn nri.SyncCB) error {
        l.Lock()
        defer l.Unlock()

        log.G(ctx).Info("Synchronizing NRI (plugin) with current runtime state")

        pods := podSandboxesToNRI(domains.listPodSandboxes())
        containers := containersToNRI(domains.listContainers())

        for _, c := range containers {
                // Any state not listed below is treated as Removed.
                tracked := Removed
                switch c.GetState() {
                case nri.ContainerState_CONTAINER_CREATED:
                        tracked = Created
                case nri.ContainerState_CONTAINER_RUNNING, nri.ContainerState_CONTAINER_PAUSED:
                        tracked = Running
                case nri.ContainerState_CONTAINER_STOPPED:
                        tracked = Stopped
                }
                l.setState(c.GetId(), tracked)
        }

        updates, err := syncFn(ctx, pods, containers)
        if err != nil {
                return err
        }

        // TODO(klihub): we ignore post-sync update failures for now
        if _, updateErr := l.applyUpdates(ctx, updates); updateErr != nil {
                log.G(ctx).WithError(updateErr).Warnf("post-sync update failed")
        }

        return nil
}

// updateFromPlugin is the unsolicited-update callback handed to the NRI
// adaptation. It applies the requested container updates under the lock and
// returns whatever applyUpdates reports as failed.
func (l *local) updateFromPlugin(ctx context.Context, req []*nri.ContainerUpdate) ([]*nri.ContainerUpdate, error) {
        l.Lock()
        defer l.Unlock()

        log.G(ctx).Trace("Unsolicited NRI container updates")

        return l.applyUpdates(ctx, req)
}

// applyUpdates hands the container updates to the domains and returns the
// updates reported as failed together with the error.
func (l *local) applyUpdates(ctx context.Context, updates []*nri.ContainerUpdate) ([]*nri.ContainerUpdate, error) {
        // TODO(klihub): should we pre-save state and attempt a rollback on failure ?
        return domains.updateContainers(ctx, updates)
}

// evictContainers hands the eviction requests to the domains and returns the
// evictions reported as failed together with the error.
func (l *local) evictContainers(ctx context.Context, evict []*nri.ContainerEviction) ([]*nri.ContainerEviction, error) {
        return domains.evictContainers(ctx, evict)
}

// setState records state for id. Removed is not stored; it deletes the
// entry for id instead.
func (l *local) setState(id string, state State) {
        if state == Removed {
                delete(l.state, id)
                return
        }

        l.state[id] = state
}

// getState returns the state recorded for id, or Removed when there is no
// entry for it.
func (l *local) getState(id string) State {
        state, known := l.state[id]
        if !known {
                return Removed
        }

        return state
}

// needsStopping reports whether the recorded state of id is Created or
// Running.
func (l *local) needsStopping(id string) bool {
        switch l.getState(id) {
        case Created, Running:
                return true
        default:
                return false
        }
}

// needsRemoval reports whether the recorded state of id is Created, Running
// or Stopped.
func (l *local) needsRemoval(id string) bool {
        switch l.getState(id) {
        case Created, Running, Stopped:
                return true
        default:
                return false
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package nri

import (
        nri "github.com/containerd/nri/pkg/adaptation"
)

// PodSandbox is the view of a pod sandbox that the NRI adapter needs in
// order to build the nri.PodSandbox messages it relays to NRI.
type PodSandbox interface {
        // GetDomain is not used by the conversion helpers in this file.
        // NOTE(review): presumably identifies the domain owning the pod — confirm.
        GetDomain() string

        // GetID supplies the Id field of the NRI pod; it is also the key
        // under which the pod's state is tracked.
        GetID() string
        // GetName supplies the Name field of the NRI pod.
        GetName() string
        // GetUID supplies the Uid field of the NRI pod.
        GetUID() string
        // GetNamespace supplies the Namespace field of the NRI pod.
        GetNamespace() string
        // GetLabels supplies the Labels field of the NRI pod.
        GetLabels() map[string]string
        // GetAnnotations supplies the Annotations field of the NRI pod.
        GetAnnotations() map[string]string
        // GetRuntimeHandler supplies the RuntimeHandler field of the NRI pod.
        GetRuntimeHandler() string
        // GetLinuxPodSandbox returns the accessor for the Linux-specific
        // part of the pod.
        GetLinuxPodSandbox() LinuxPodSandbox

        // GetPid supplies the Pid field of the NRI pod.
        GetPid() uint32
}

// LinuxPodSandbox gives access to the Linux-specific data of a pod sandbox.
// Each getter supplies one field of the nri.LinuxPodSandbox message built on
// Linux.
type LinuxPodSandbox interface {
        // GetLinuxNamespaces supplies the Namespaces field.
        GetLinuxNamespaces() []*nri.LinuxNamespace
        // GetPodLinuxOverhead supplies the PodOverhead field.
        GetPodLinuxOverhead() *nri.LinuxResources
        // GetPodLinuxResources supplies the PodResources field.
        GetPodLinuxResources() *nri.LinuxResources
        // GetCgroupParent supplies the CgroupParent field.
        GetCgroupParent() string
        // GetCgroupsPath supplies the CgroupsPath field.
        GetCgroupsPath() string
        // GetLinuxResources supplies the Resources field.
        GetLinuxResources() *nri.LinuxResources
}

// commonPodSandboxToNRI builds the platform-independent part of the NRI
// representation of pod. The Linux field is left unset.
func commonPodSandboxToNRI(pod PodSandbox) *nri.PodSandbox {
        nriPod := new(nri.PodSandbox)

        nriPod.Id = pod.GetID()
        nriPod.Name = pod.GetName()
        nriPod.Uid = pod.GetUID()
        nriPod.Namespace = pod.GetNamespace()
        nriPod.Labels = pod.GetLabels()
        nriPod.Annotations = pod.GetAnnotations()
        nriPod.RuntimeHandler = pod.GetRuntimeHandler()
        nriPod.Pid = pod.GetPid()

        return nriPod
}

// podSandboxesToNRI converts every pod in podList to its NRI representation,
// preserving order. The result is never nil, even for an empty podList.
func podSandboxesToNRI(podList []PodSandbox) []*nri.PodSandbox {
        // The final length is known up front, so allocate the result once.
        pods := make([]*nri.PodSandbox, 0, len(podList))
        for _, pod := range podList {
                pods = append(pods, podSandboxToNRI(pod))
        }
        return pods
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package nri

import (
        nri "github.com/containerd/nri/pkg/adaptation"
)

// podSandboxToNRI builds the full NRI representation of pod on Linux: the
// common fields plus the Linux-specific section filled from
// pod.GetLinuxPodSandbox().
func podSandboxToNRI(pod PodSandbox) *nri.PodSandbox {
        result := commonPodSandboxToNRI(pod)
        linuxPod := pod.GetLinuxPodSandbox()

        linuxPart := new(nri.LinuxPodSandbox)
        linuxPart.Namespaces = linuxPod.GetLinuxNamespaces()
        linuxPart.PodOverhead = linuxPod.GetPodLinuxOverhead()
        linuxPart.PodResources = linuxPod.GetPodLinuxResources()
        linuxPart.CgroupParent = linuxPod.GetCgroupParent()
        linuxPart.CgroupsPath = linuxPod.GetCgroupsPath()
        linuxPart.Resources = linuxPod.GetLinuxResources()

        result.Linux = linuxPart
        return result
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package randutil provides utilities for [crypto/rand].
package randutil

import (
        "crypto/rand"
        "math"
        "math/big"
)

// Int63n returns a random integer in [0, n) read from [crypto/rand.Reader].
// It mirrors [math/rand.Int63n] and panics if the random source fails.
func Int63n(n int64) int64 {
        limit := new(big.Int).SetInt64(n)

        value, err := rand.Int(rand.Reader, limit)
        if err != nil {
                panic(err)
        }

        return value.Int64()
}

// Int63 returns a non-negative random int64 read from [crypto/rand.Reader],
// in the manner of [math/rand.Int63]. The upper bound passed to Int63n is
// math.MaxInt64.
func Int63() int64 {
        const limit int64 = math.MaxInt64
        return Int63n(limit)
}

// Intn returns a random int in [0, n) read from [crypto/rand.Reader], in the
// manner of [math/rand.Intn].
func Intn(n int) int {
        bound := int64(n)
        return int(Int63n(bound))
}

// Int returns a non-negative random int read from [crypto/rand.Reader], in
// the manner of [math/rand.Int].
//
// The 63-bit value is narrowed through uint and its top bit is cleared, the
// same way math/rand does it. A plain int(int64) conversion would keep only
// the low 32 bits on 32-bit platforms and could produce a negative number.
func Int() int {
        u := uint(Int63())
        return int(u << 1 >> 1) // clear the sign bit if int is 32 bits wide
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package registrar

import (
        "fmt"
        "sync"
)

// Registrar keeps a one-to-one mapping between names and keys: a name maps
// to exactly one key and a key to exactly one name.
// All methods take the internal lock, so a Registrar may be used from
// multiple goroutines.
type Registrar struct {
        lock      sync.Mutex
        nameToKey map[string]string
        keyToName map[string]string
}

// NewRegistrar returns a Registrar with both indexes allocated and empty.
func NewRegistrar() *Registrar {
        r := new(Registrar)
        r.nameToKey = map[string]string{}
        r.keyToName = map[string]string{}
        return r
}

// Reserve records the name<->key pair.
// Neither name nor key may be empty.
// Reserving a pair that is already reserved succeeds without any change.
// It is an error if the name is already bound to a different key, or the
// key to a different name.
func (r *Registrar) Reserve(name, key string) error {
        r.lock.Lock()
        defer r.lock.Unlock()

        if name == "" || key == "" {
                return fmt.Errorf("invalid name %q or key %q", name, key)
        }

        if boundKey, taken := r.nameToKey[name]; taken {
                if boundKey == key {
                        return nil
                }
                return fmt.Errorf("name %q is reserved for %q", name, boundKey)
        }

        if boundName, taken := r.keyToName[key]; taken {
                if boundName == name {
                        return nil
                }
                return fmt.Errorf("key %q is reserved for %q", key, boundName)
        }

        r.keyToName[key] = name
        r.nameToKey[name] = key
        return nil
}

// ReleaseByName drops the pair that name belongs to, if any.
// Afterwards both the name and its former key can be reserved again.
func (r *Registrar) ReleaseByName(name string) {
        r.lock.Lock()
        defer r.lock.Unlock()

        if key, reserved := r.nameToKey[name]; reserved {
                delete(r.nameToKey, name)
                delete(r.keyToName, key)
        }
}

// ReleaseByKey drops the pair that key belongs to, if any.
// Afterwards both the key and its former name can be reserved again.
func (r *Registrar) ReleaseByKey(key string) {
        r.lock.Lock()
        defer r.lock.Unlock()

        if name, reserved := r.keyToName[key]; reserved {
                delete(r.nameToKey, name)
                delete(r.keyToName, key)
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package tomlext

import "time"

// Duration is a time.Duration that marshals to and from its textual form
// (for example "1m30s").
type Duration time.Duration

// UnmarshalText parses b with time.ParseDuration and stores the result in d.
// On a parse error d is left untouched and the error is returned.
func (d *Duration) UnmarshalText(b []byte) error {
        parsed, err := time.ParseDuration(string(b))
        if err == nil {
                *d = Duration(parsed)
        }
        return err
}

// MarshalText renders d in the format produced by time.Duration.String.
// The returned error is always nil.
func (d Duration) MarshalText() (text []byte, err error) {
        std := time.Duration(d)
        return []byte(std.String()), nil
}

// ToStdTime converts d to the equivalent time.Duration.
func ToStdTime(d Duration) time.Duration {
        std := time.Duration(d)
        return std
}

// FromStdTime converts a time.Duration to the equivalent Duration.
func FromStdTime(duration time.Duration) Duration {
        converted := Duration(duration)
        return converted
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// This file is a copy of moby/moby/pkg/truncindex/truncindex.go

// Package truncindex provides a general 'index tree', used by Docker
// in order to be able to reference containers by only a few unambiguous
// characters of their id.
package truncindex

import (
        "errors"
        "fmt"
        "strings"
        "sync"

        "github.com/tchap/go-patricia/v2/patricia"
)

// Sentinel errors returned by TruncIndex operations.
var (
        // ErrEmptyPrefix is returned when an empty prefix is looked up, and
        // also when an empty ID is added.
        ErrEmptyPrefix = errors.New("prefix can't be empty")

        // ErrIllegalChar is returned when an ID being added contains a space.
        ErrIllegalChar = errors.New("illegal character: ' '")

        // ErrNotExist is returned when no stored ID starts with the
        // requested prefix.
        ErrNotExist = errors.New("ID does not exist")
)

// ErrAmbiguousPrefix reports that more than one stored ID starts with the
// requested prefix.
type ErrAmbiguousPrefix struct {
        prefix string
}

// Error implements the error interface; the message includes the offending
// prefix.
func (e ErrAmbiguousPrefix) Error() string {
        msg := fmt.Sprintf("Multiple IDs found with provided prefix: %s", e.prefix)
        return msg
}

// TruncIndex allows the retrieval of string identifiers by any of their unique prefixes.
// This is used to retrieve image and container IDs by more convenient shorthand prefixes.
type TruncIndex struct {
        // The embedded RWMutex guards trie and ids.
        sync.RWMutex
        // trie holds every stored ID as a key and serves the prefix lookups.
        trie *patricia.Trie
        // ids is the set of stored IDs, used for exact-match checks on add
        // and delete.
        ids map[string]struct{}
}

// NewTruncIndex returns a TruncIndex pre-populated with ids. IDs that cannot
// be added (empty, containing a space, or duplicated) are silently skipped.
func NewTruncIndex(ids []string) (idx *TruncIndex) {
        idx = new(TruncIndex)
        idx.ids = make(map[string]struct{})

        // Change patricia max prefix per node length,
        // because our len(ID) always 64
        idx.trie = patricia.NewTrie(patricia.MaxPrefixPerNode(64))

        for i := range ids {
                // Errors from individual IDs are intentionally not reported.
                _ = idx.addID(ids[i])
        }
        return idx
}

// addID stores id in both the ID set and the trie. It rejects IDs that
// contain a space, are empty, or are already stored. It does no locking of
// its own.
func (idx *TruncIndex) addID(id string) error {
        switch {
        case strings.Contains(id, " "):
                return ErrIllegalChar
        case id == "":
                return ErrEmptyPrefix
        }

        if _, dup := idx.ids[id]; dup {
                return fmt.Errorf("id already exists: '%s'", id)
        }

        idx.ids[id] = struct{}{}
        if !idx.trie.Insert(patricia.Prefix(id), struct{}{}) {
                return fmt.Errorf("failed to insert id: %s", id)
        }
        return nil
}

// Add stores a new ID in the TruncIndex while holding the write lock.
// See addID for the conditions under which an ID is rejected.
func (idx *TruncIndex) Add(id string) error {
        idx.Lock()
        defer idx.Unlock()

        err := idx.addID(id)
        return err
}

// Delete removes an ID from the TruncIndex. The ID must match a stored ID
// exactly; an empty or unknown ID yields an error.
func (idx *TruncIndex) Delete(id string) error {
        idx.Lock()
        defer idx.Unlock()

        _, known := idx.ids[id]
        if !known || id == "" {
                return fmt.Errorf("no such id: '%s'", id)
        }

        delete(idx.ids, id)
        if !idx.trie.Delete(patricia.Prefix(id)) {
                return fmt.Errorf("no such id: '%s'", id)
        }
        return nil
}

// Get returns the single stored ID that starts with s.
//
// It returns ErrEmptyPrefix for an empty s, ErrAmbiguousPrefix when more
// than one stored ID starts with s, and ErrNotExist when none does.
func (idx *TruncIndex) Get(s string) (string, error) {
        if s == "" {
                return "", ErrEmptyPrefix
        }

        var match string
        visit := func(prefix patricia.Prefix, item patricia.Item) error {
                if match == "" {
                        match = string(prefix)
                        return nil
                }
                // A second hit means the prefix does not identify a single ID.
                match = ""
                return ErrAmbiguousPrefix{prefix: s}
        }

        idx.RLock()
        defer idx.RUnlock()

        if err := idx.trie.VisitSubtree(patricia.Prefix(s), visit); err != nil {
                return "", err
        }
        if match == "" {
                return "", ErrNotExist
        }
        return match, nil
}

// Iterate calls handler once for every stored ID while holding the write
// lock. The handler must not call any public method of the same TruncIndex:
// the lock is not reentrant and doing so would deadlock.
func (idx *TruncIndex) Iterate(handler func(id string)) {
        idx.Lock()
        defer idx.Unlock()

        forward := func(prefix patricia.Prefix, item patricia.Item) error {
                handler(string(prefix))
                return nil
        }
        idx.trie.Visit(forward)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package apparmor

// HostSupports reports whether apparmor can be used on this host:
//   - On Linux it is true if apparmor is enabled, apparmor_parser is
//     present, and we are not running docker-in-docker.
//   - On non-Linux it is false.
//
// The check is derived from libcontainer/apparmor.IsEnabled(), extended with
// the apparmor_parser and docker-in-docker checks.
func HostSupports() bool {
        supported := hostSupports()
        return supported
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package apparmor

import (
        "os"
        "sync"
)

var (
        // appArmorSupported caches the result computed by hostSupports.
        appArmorSupported bool
        // checkAppArmor makes hostSupports run its detection only once.
        checkAppArmor     sync.Once
)

// hostSupports reports whether apparmor is usable on the host. The result is
// computed on the first call and cached for later calls. It is true only if
// all of the following hold:
//   - /sys/kernel/security/apparmor exists,
//   - the "container" environment variable is empty,
//   - /sbin/apparmor_parser exists,
//   - /sys/module/apparmor/parameters/enabled is readable, longer than one
//     byte and starts with 'Y'.
//
// This is derived from libcontainer/apparmor.IsEnabled(), with the addition
// of checks for apparmor_parser to be present and docker-in-docker.
func hostSupports() bool {
        checkAppArmor.Do(func() {
                // see https://github.com/opencontainers/runc/blob/0d49470392206f40eaab3b2190a57fe7bb3df458/libcontainer/apparmor/apparmor_linux.go
                _, securityErr := os.Stat("/sys/kernel/security/apparmor")
                if securityErr != nil || os.Getenv("container") != "" {
                        return
                }
                if _, parserErr := os.Stat("/sbin/apparmor_parser"); parserErr != nil {
                        return
                }
                enabled, readErr := os.ReadFile("/sys/module/apparmor/parameters/enabled")
                appArmorSupported = readErr == nil && len(enabled) > 1 && enabled[0] == 'Y'
        })
        return appArmorSupported
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package compression

import (
        "bufio"
        "bytes"
        "compress/gzip"
        "context"
        "encoding/binary"
        "fmt"
        "io"
        "os"
        "os/exec"
        "strconv"
        "sync"

        "github.com/containerd/log"
        "github.com/klauspost/compress/zstd"
)

type (
        // Compression identifies the compression algorithm applied to a
        // stream, or the absence of compression.
        Compression int
)

const (
        // Uncompressed means no compression; it is the zero value of
        // Compression.
        Uncompressed Compression = iota
        // Gzip is gzip compression algorithm.
        Gzip
        // Zstd is zstd compression algorithm.
        Zstd
)

// NOTE(review): presumably the names of environment variables that disable
// the external pigz / igzip decompressors — confirm against their readers.
const (
        disablePigzEnv  = "CONTAINERD_DISABLE_PIGZ"
        disableIgzipEnv = "CONTAINERD_DISABLE_IGZIP"
)

// NOTE(review): presumably gzipPath is the path of an external gzip
// implementation, resolved once under initGzip — confirm against the code
// that sets it.
var (
        initGzip sync.Once
        gzipPath string
)

var (
        // bufioReader32KPool recycles 32 KiB bufio.Readers. newBufferedReader
        // takes readers from it, and bufferedReader.Read returns them once
        // the underlying stream reports io.EOF.
        bufioReader32KPool = &sync.Pool{
                New: func() interface{} { return bufio.NewReaderSize(nil, 32*1024) },
        }
)

// DecompressReadCloser is a decompressed stream that also reports which
// compression method was detected on the original data.
type DecompressReadCloser interface {
        io.ReadCloser
        // GetCompression returns the compression method that the stream used
        // before it was decompressed.
        GetCompression() Compression
}

// readCloserWrapper adds a Close method and a compression tag to an
// io.Reader.
type readCloserWrapper struct {
        io.Reader
        // compression is the value reported by GetCompression.
        compression Compression
        // closer, when non-nil, is invoked by Close.
        closer      func() error
}

// Close runs the close hook, if one is set, and returns its result. Without
// a hook it returns nil.
func (r *readCloserWrapper) Close() error {
        if r.closer == nil {
                return nil
        }
        return r.closer()
}

// GetCompression returns the compression tag stored in the wrapper.
func (r *readCloserWrapper) GetCompression() Compression {
        detected := r.compression
        return detected
}

// writeCloserWrapper adds a Close method to an io.Writer.
type writeCloserWrapper struct {
        io.Writer
        // closer, when non-nil, is invoked by Close.
        closer func() error
}

// Close runs the close hook, if one is set, and returns its error so that a
// failed close is reported to the caller. Without a hook it returns nil.
func (w *writeCloserWrapper) Close() error {
        if w.closer == nil {
                return nil
        }
        return w.closer()
}

// bufferedReader wraps a pooled bufio.Reader and hands it back to the pool
// when the stream is exhausted.
type bufferedReader struct {
        // buf is nil once the reader has been returned to the pool.
        buf *bufio.Reader
}

// newBufferedReader takes a bufio.Reader from the pool, points it at r and
// wraps it in a bufferedReader.
func newBufferedReader(r io.Reader) *bufferedReader {
        pooled := bufioReader32KPool.Get().(*bufio.Reader)
        pooled.Reset(r)

        return &bufferedReader{buf: pooled}
}

// Read reads from the buffered stream. When the stream reports io.EOF the
// underlying bufio.Reader is reset and returned to the pool; every later
// call then returns (0, io.EOF).
func (r *bufferedReader) Read(p []byte) (n int, err error) {
        if r.buf == nil {
                return 0, io.EOF
        }

        if n, err = r.buf.Read(p); err != io.EOF {
                return n, err
        }

        // The stream is exhausted: recycle the reader.
        r.buf.Reset(nil)
        bufioReader32KPool.Put(r.buf)
        r.buf = nil

        return n, err
}

// Peek returns the next n bytes without consuming them. Once the reader has
// been returned to the pool it returns (nil, io.EOF).
func (r *bufferedReader) Peek(n int) ([]byte, error) {
        if r.buf != nil {
                return r.buf.Peek(n)
        }
        return nil, io.EOF
}

const (
        zstdMagicSkippableStart = 0x184D2A50
        zstdMagicSkippableMask  = 0xFFFFFFF0
)

var (
        gzipMagic = []byte{0x1F, 0x8B, 0x08}
        zstdMagic = []byte{0x28, 0xb5, 0x2f, 0xfd}
)

// matcher reports whether the given leading bytes belong to a particular
// compression format.
type matcher = func([]byte) bool

// magicNumberMatcher builds a matcher that succeeds when the input begins
// with the magic bytes m.
func magicNumberMatcher(m []byte) matcher {
        return func(source []byte) bool {
                if len(source) < len(m) {
                        return false
                }
                return bytes.Equal(source[:len(m)], m)
        }
}

// zstdMatcher returns a matcher for zstd-compressed input.
// Zstandard defines two frame formats, Zstandard frames and Skippable frames,
// and both are recognized here.
// See https://datatracker.ietf.org/doc/html/rfc8878#section-3 for more details.
func zstdMatcher() matcher {
        return func(source []byte) bool {
                // Regular Zstandard frame.
                if bytes.HasPrefix(source, zstdMagic) {
                        return true
                }
                // Inputs shorter than 8 bytes are never treated as a skippable frame.
                if len(source) < 8 {
                        return false
                }
                // Skippable frame: little-endian magic in 0x184D2A50..0x184D2A5F.
                magic := binary.LittleEndian.Uint32(source[:4])
                return magic&zstdMagicSkippableMask == zstdMagicSkippableStart
        }
}

// DetectCompression inspects the leading bytes in source and reports which
// compression algorithm they belong to. Input that matches neither gzip nor
// zstd is reported as Uncompressed.
func DetectCompression(source []byte) Compression {
        switch {
        case magicNumberMatcher(gzipMagic)(source):
                return Gzip
        case zstdMatcher()(source):
                return Zstd
        default:
                return Uncompressed
        }
}

// DecompressStream decompresses the archive and returns a ReadCloser with the decompressed archive.
// The compression format is detected from up to the first 10 bytes of the
// stream, and the returned reader reports it through GetCompression.
func DecompressStream(archive io.Reader) (DecompressReadCloser, error) {
        buf := newBufferedReader(archive)
        // Peek does not consume input, so the decompressor below still sees
        // the stream from its first byte.
        bs, err := buf.Peek(10)
        if err != nil && err != io.EOF {
                // Note: we'll ignore any io.EOF error because there are some odd
                // cases where the layer.tar file will be empty (zero bytes) and
                // that results in an io.EOF from the Peek() call. So, in those
                // cases we'll just treat it as a non-compressed stream and
                // that means just create an empty layer.
                // See Issue docker/docker#18170
                return nil, err
        }

        switch compression := DetectCompression(bs); compression {
        case Uncompressed:
                // No closer is set, so Close on the result is a no-op.
                return &readCloserWrapper{
                        Reader:      buf,
                        compression: compression,
                }, nil
        case Gzip:
                // The context is passed to gzipDecompress; it is cancelled
                // either on setup failure or when the result is closed.
                ctx, cancel := context.WithCancel(context.Background())
                gzReader, err := gzipDecompress(ctx, buf)
                if err != nil {
                        cancel()
                        return nil, err
                }

                return &readCloserWrapper{
                        Reader:      gzReader,
                        compression: compression,
                        closer: func() error {
                                cancel()
                                return gzReader.Close()
                        },
                }, nil
        case Zstd:
                zstdReader, err := zstd.NewReader(buf)
                if err != nil {
                        return nil, err
                }
                return &readCloserWrapper{
                        Reader:      zstdReader,
                        compression: compression,
                        closer: func() error {
                                // zstdReader.Close yields no value to forward here.
                                zstdReader.Close()
                                return nil
                        },
                }, nil

        default:
                return nil, fmt.Errorf("unsupported compression format %s", (&compression).Extension())
        }
}

// CompressStream wraps dest in a writer that applies the requested
// compression algorithm. For Uncompressed the data is passed through
// unchanged; an unknown algorithm yields an error.
func CompressStream(dest io.Writer, compression Compression) (io.WriteCloser, error) {
        switch compression {
        case Gzip:
                return gzip.NewWriter(dest), nil
        case Zstd:
                return zstd.NewWriter(dest)
        case Uncompressed:
                return &writeCloserWrapper{Writer: dest}, nil
        }
        return nil, fmt.Errorf("unsupported compression format %s", (&compression).Extension())
}

// Extension returns the file extension (without a leading dot) used for
// files compressed with this algorithm: "gz" for Gzip, "zst" for Zstd and
// the empty string for anything else.
func (compression *Compression) Extension() string {
        if *compression == Gzip {
                return "gz"
        }
        if *compression == Zstd {
                return "zst"
        }
        return ""
}

// gzipDecompress returns a reader over the gunzipped contents of buf.
// On first use it looks for an external helper, trying igzip and then
// unpigz. When a helper was found the data is piped through it with
// "-d -c" under ctx; otherwise gzip.NewReader is used.
func gzipDecompress(ctx context.Context, buf io.Reader) (io.ReadCloser, error) {
        initGzip.Do(func() {
                gzipPath = detectCommand("igzip", disableIgzipEnv)
                if gzipPath != "" {
                        log.L.Debug("using igzip for decompression")
                        return
                }
                gzipPath = detectCommand("unpigz", disablePigzEnv)
                if gzipPath != "" {
                        log.L.Debug("using unpigz for decompression")
                }
        })

        if gzipPath != "" {
                return cmdStream(exec.CommandContext(ctx, gzipPath, "-d", "-c"), buf)
        }
        return gzip.NewReader(buf)
}

// cmdStream starts cmd with in connected to its stdin and returns a reader
// over its stdout. Stderr is captured in memory.
//
// If the command cannot be started the error from Start is returned. If it
// starts but later fails, the reader returns an error that wraps the error
// from Wait (so errors.Is / errors.As work on it) and carries the captured
// stderr text in its message. On success the reader ends with io.EOF.
func cmdStream(cmd *exec.Cmd, in io.Reader) (io.ReadCloser, error) {
        reader, writer := io.Pipe()

        cmd.Stdin = in
        cmd.Stdout = writer

        var errBuf bytes.Buffer
        cmd.Stderr = &errBuf

        if err := cmd.Start(); err != nil {
                return nil, err
        }

        // Wait in the background and close the write side of the pipe once
        // the command has exited, so the reader observes the outcome.
        go func() {
                if err := cmd.Wait(); err != nil {
                        // %w keeps the error chain intact; the message text is
                        // the same as with %s.
                        writer.CloseWithError(fmt.Errorf("%w: %s", err, errBuf.String()))
                        return
                }
                writer.Close()
        }()

        return reader, nil
}

// detectCommand resolves the executable named by path via exec.LookPath and
// returns the resolved location, or "" when the command is disabled or
// cannot be found.
//
// disableEnvName names an environment variable; when it holds a value that
// strconv.ParseBool reads as true, the command is treated as disabled. An
// unparsable value is logged as a warning and otherwise ignored.
func detectCommand(path, disableEnvName string) string {
        // Check if this command is disabled via the env variable
        if value := os.Getenv(disableEnvName); value != "" {
                disable, err := strconv.ParseBool(value)
                if err != nil {
                        log.L.WithError(err).Warnf("could not parse %s: %s", disableEnvName, value)
                }

                if disable {
                        return ""
                }
        }

        // Keep the lookup result in its own variable: reusing path here would
        // overwrite the command name and leave the log message below without
        // the name that was searched for.
        resolved, err := exec.LookPath(path)
        if err != nil {
                log.L.WithError(err).Debugf("%s not found", path)
                return ""
        }

        return resolved
}

//go:build gofuzz

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package compression

import (
        "bytes"
)

// FuzzDecompressStream is a fuzzing entry point that feeds data to
// DecompressStream. It always returns 1.
func FuzzDecompressStream(data []byte) int {
        rc, err := DecompressStream(bytes.NewReader(data))
        if err == nil {
                // Close the stream so each fuzz iteration releases whatever
                // the decompressor holds; the close error is irrelevant here.
                _ = rc.Close()
        }
        return 1
}

//go:build !freebsd

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import "os"

// link creates newname as a hard link to oldname by calling os.Link and
// returns its error unchanged.
func link(oldname, newname string) error {
        return os.Link(oldname, newname)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import (
        "archive/tar"
        "context"
        "errors"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "strings"
        "sync"
        "syscall"
        "time"

        "github.com/containerd/containerd/v2/pkg/archive/tarheader"
        "github.com/containerd/containerd/v2/pkg/epoch"
        "github.com/containerd/containerd/v2/pkg/userns"
        "github.com/containerd/continuity/fs"
        "github.com/containerd/log"
)

// bufPool recycles 32 KiB byte slices. Entries are stored as *[]byte.
var bufPool = &sync.Pool{
        New: func() interface{} {
                scratch := make([]byte, 32<<10)
                return &scratch
        },
}

// errInvalidArchive is a sentinel error carrying the message "invalid archive".
var errInvalidArchive = errors.New("invalid archive")

// Diff computes the filesystem difference between directories a and b and
// returns it as a tar stream. The stream is produced by WriteDiff in a
// background goroutine; a failure there is delivered to the reader as the
// pipe's error.
//
// The tar uses OCI style file markers for deletions: deleted files are
// recorded with the ".wh." prefix, a style based on AUFS whiteouts.
// See https://github.com/opencontainers/image-spec/blob/main/layer.md
func Diff(ctx context.Context, a, b string, opts ...WriteDiffOpt) io.ReadCloser {
        pr, pw := io.Pipe()

        go func() {
                writeErr := WriteDiff(ctx, pw, a, b, opts...)
                if writeErr != nil {
                        log.G(ctx).WithError(writeErr).Debugf("write diff failed")
                }
                // Hand the outcome (possibly nil) to the reading side.
                if closeErr := pw.CloseWithError(writeErr); closeErr != nil {
                        log.G(ctx).WithError(closeErr).Debugf("closing tar pipe failed")
                }
        }()

        return pr
}

// WriteDiff computes the difference between paths a and b and writes it to
// w as a tar stream. Options are applied in order; when no source date epoch
// was set by an option, one found in ctx is used, and when no diff function
// was configured, writeDiffNaive is used.
//
// The tar uses OCI style file markers for deletions: deleted files are
// recorded with the ".wh." prefix, a style based on AUFS whiteouts.
// See https://github.com/opencontainers/image-spec/blob/main/layer.md
func WriteDiff(ctx context.Context, w io.Writer, a, b string, opts ...WriteDiffOpt) error {
        var options WriteDiffOptions
        for _, apply := range opts {
                err := apply(&options)
                if err != nil {
                        return fmt.Errorf("failed to apply option: %w", err)
                }
        }

        tm := epoch.FromContext(ctx)
        if tm != nil && options.SourceDateEpoch == nil {
                options.SourceDateEpoch = tm
        }

        if options.writeDiffFunc == nil {
                options.writeDiffFunc = writeDiffNaive
        }
        diffFn := options.writeDiffFunc

        return diffFn(ctx, w, a, b, options)
}

// writeDiffNaive walks the changes between directories a and b on disk and
// writes them to w as a tar stream through a ChangeWriter rooted at b.
//
// The tar uses OCI style file markers for deletions: deleted files are
// recorded with the ".wh." prefix, a style based on AUFS whiteouts.
// See https://github.com/opencontainers/image-spec/blob/main/layer.md
func writeDiffNaive(ctx context.Context, w io.Writer, a, b string, o WriteDiffOptions) error {
        var cwOpts []ChangeWriterOpt
        if epochTime := o.SourceDateEpoch; epochTime != nil {
                // Since containerd v2.0, the whiteout timestamps are set to zero (1970-01-01),
                // not to the source date epoch; only the mod time bound is passed on.
                cwOpts = append(cwOpts, WithModTimeUpperBound(*epochTime))
        }

        writer := NewChangeWriter(w, b, cwOpts...)
        if err := fs.Changes(ctx, a, b, writer.HandleChange); err != nil {
                return fmt.Errorf("failed to create diff tar stream: %w", err)
        }
        return writer.Close()
}

const (
        // whiteoutPrefix prefix means file is a whiteout. If this is followed by a
        // filename this means that file has been removed from the base layer.
        // See https://github.com/opencontainers/image-spec/blob/main/layer.md#whiteouts
        whiteoutPrefix = ".wh."

        // whiteoutMetaPrefix prefix means whiteout has a special meaning and is not
        // for removing an actual file. Normally these files are excluded from exported
        // archives.
        whiteoutMetaPrefix = whiteoutPrefix + whiteoutPrefix

        // whiteoutOpaqueDir file means directory has been made opaque - meaning
        // readdir calls to this directory do not follow to lower layers.
        whiteoutOpaqueDir = whiteoutMetaPrefix + ".opq"

        // paxSchilyXattr is the PAX record key prefix that marks a record as an
        // extended attribute; the rest of the key is the attribute name.
        paxSchilyXattr = "SCHILY.xattr."

        // userXattrPrefix is the prefix of extended attribute names in the
        // user.* namespace.
        userXattrPrefix = "user."
)

// Apply unpacks an OCI style diff tar read from r into the directory root
// and returns the value reported by the apply function. Options are applied
// in order; an unset filter defaults to accepting everything (all) and an
// unset apply function defaults to applyNaive.
// See https://github.com/opencontainers/image-spec/blob/main/layer.md#applying-changesets
func Apply(ctx context.Context, root string, r io.Reader, opts ...ApplyOpt) (int64, error) {
        var options ApplyOptions
        for _, apply := range opts {
                err := apply(&options)
                if err != nil {
                        return 0, fmt.Errorf("failed to apply option: %w", err)
                }
        }

        // Fill in defaults for anything the options left unset.
        if options.Filter == nil {
                options.Filter = all
        }
        if options.applyFunc == nil {
                options.applyFunc = applyNaive
        }

        return options.applyFunc(ctx, filepath.Clean(root), r, options)
}

// applyNaive applies a tar stream of an OCI style diff tar to a directory
// applying each file as either a whole file or whiteout.
// See https://github.com/opencontainers/image-spec/blob/main/layer.md#applying-changesets
//
// On success it returns the sum of hdr.Size over every header read from the
// stream, including entries that are subsequently filtered out or skipped.
// On any failure, or when ctx is done, it returns 0 and the error.
func applyNaive(ctx context.Context, root string, r io.Reader, options ApplyOptions) (size int64, err error) {
        var (
                // Directory headers, kept so their times can be set after all
                // entries have been written.
                dirs []*tar.Header

                tr = tar.NewReader(r)

                // Used for handling opaque directory markers which
                // may occur out of order
                unpackedPaths = make(map[string]struct{})

                convertWhiteout = options.ConvertWhiteout
        )

        if convertWhiteout == nil {
                // handle whiteouts by removing the target files
                // The returned bool tells the caller whether the entry should
                // still be written to disk as a regular tar entry.
                convertWhiteout = func(hdr *tar.Header, path string) (bool, error) {
                        base := filepath.Base(path)
                        dir := filepath.Dir(path)
                        if base == whiteoutOpaqueDir {
                                _, err := os.Lstat(dir)
                                if err != nil {
                                        return false, err
                                }
                                // Remove everything under dir that was not unpacked
                                // from this archive; dir itself is kept.
                                err = filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
                                        if err != nil {
                                                if os.IsNotExist(err) {
                                                        err = nil // parent was deleted
                                                }
                                                return err
                                        }
                                        if path == dir {
                                                return nil
                                        }
                                        if _, exists := unpackedPaths[path]; !exists {
                                                err := os.RemoveAll(path)
                                                return err
                                        }
                                        return nil
                                })
                                return false, err
                        }

                        if strings.HasPrefix(base, whiteoutPrefix) {
                                // Plain whiteout: remove the sibling named by the
                                // part of base after the prefix.
                                originalBase := base[len(whiteoutPrefix):]
                                originalPath := filepath.Join(dir, originalBase)

                                return false, os.RemoveAll(originalPath)
                        }

                        return true, nil
                }
        }

        // Iterate through the files in the archive.
        for {
                // Stop early, without blocking, if the context is already done.
                select {
                case <-ctx.Done():
                        return 0, ctx.Err()
                default:
                }

                hdr, err := tr.Next()
                if err == io.EOF {
                        // end of tar archive
                        break
                }
                if err != nil {
                        return 0, err
                }

                size += hdr.Size

                // Normalize name, for safety and for a simple is-root check
                hdr.Name = filepath.Clean(hdr.Name)

                accept, err := options.Filter(hdr)
                if err != nil {
                        return 0, err
                }
                if !accept {
                        continue
                }

                if skipFile(hdr) {
                        log.G(ctx).Warnf("file %q ignored: archive may not be supported on system", hdr.Name)
                        continue
                }

                // Split name and resolve symlinks for root directory.
                ppath, base := filepath.Split(hdr.Name)
                ppath, err = fs.RootPath(root, ppath)
                if err != nil {
                        return 0, fmt.Errorf("failed to get root path: %w", err)
                }

                // Join to root before joining to parent path to ensure relative links are
                // already resolved based on the root before adding to parent.
                path := filepath.Join(ppath, filepath.Join("/", base))
                if path == root {
                        log.G(ctx).Debugf("file %q ignored: resolved to root", hdr.Name)
                        continue
                }

                // If file is not directly under root, ensure parent directory
                // exists or is created.
                if ppath != root {
                        parentPath := ppath
                        if base == "" {
                                parentPath = filepath.Dir(path)
                        }
                        if err := mkparent(ctx, parentPath, root, options.Parents); err != nil {
                                return 0, err
                        }
                }

                // Naive whiteout convert function which handles whiteout files by
                // removing the target files.
                if err := validateWhiteout(path); err != nil {
                        return 0, err
                }
                writeFile, err := convertWhiteout(hdr, path)
                if err != nil {
                        return 0, fmt.Errorf("failed to convert whiteout file %q: %w", hdr.Name, err)
                }
                if !writeFile {
                        continue
                }
                // If path exits we almost always just want to remove and replace it.
                // The only exception is when it is a directory *and* the file from
                // the layer is also a directory. Then we want to merge them (i.e.
                // just apply the metadata from the layer).
                if fi, err := os.Lstat(path); err == nil {
                        if !(fi.IsDir() && hdr.Typeflag == tar.TypeDir) {
                                if err := os.RemoveAll(path); err != nil {
                                        return 0, err
                                }
                        }
                }

                srcData := io.Reader(tr)
                srcHdr := hdr

                if err := createTarFile(ctx, path, root, srcHdr, srcData, options.NoSameOwner); err != nil {
                        return 0, err
                }

                // Directory mtimes must be handled at the end to avoid further
                // file creation in them to modify the directory mtime
                if hdr.Typeflag == tar.TypeDir {
                        dirs = append(dirs, hdr)
                }
                unpackedPaths[path] = struct{}{}
        }

        // Second pass: set the times of every directory that was unpacked.
        for _, hdr := range dirs {
                path, err := fs.RootPath(root, hdr.Name)
                if err != nil {
                        return 0, err
                }
                if err := chtimes(path, boundTime(latestTime(hdr.AccessTime, hdr.ModTime)), boundTime(hdr.ModTime)); err != nil {
                        return 0, err
                }
        }

        return size, nil
}

// createTarFile creates the filesystem object described by hdr at path,
// reading file contents from reader for regular files, and then applies
// ownership (unless noSameOwner is set), SCHILY.xattr PAX records, mode and
// times. Hard link targets are resolved relative to extractDir.
//
// PAX global extended headers are logged and ignored. An unknown type flag
// results in an error. An xattr that fails with ENOTSUP is skipped with a
// warning, as is a user.* xattr that fails with EPERM on something that is
// neither a regular file nor a directory.
func createTarFile(ctx context.Context, path, extractDir string, hdr *tar.Header, reader io.Reader, noSameOwner bool) error {
        // hdr.Mode is in linux format, which we can use for syscalls,
        // but for os.Foo() calls we need the mode converted to os.FileMode,
        // so use hdrInfo.Mode() (they differ for e.g. setuid bits)
        hdrInfo := hdr.FileInfo()

        switch hdr.Typeflag {
        case tar.TypeDir:
                // Create directory unless it exists as a directory already.
                // In that case we just want to merge the two
                if fi, err := os.Lstat(path); !(err == nil && fi.IsDir()) {
                        if err := mkdir(path, hdrInfo.Mode()); err != nil {
                                return err
                        }
                }

        //nolint:staticcheck // TypeRegA is deprecated but we may still receive an external tar with TypeRegA
        case tar.TypeReg, tar.TypeRegA:
                file, err := openFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, hdrInfo.Mode())
                if err != nil {
                        return err
                }

                // Always close the file; a copy error takes precedence over a
                // close error.
                _, err = copyBuffered(ctx, file, reader)
                if err1 := file.Close(); err == nil {
                        err = err1
                }
                if err != nil {
                        return err
                }

        case tar.TypeBlock, tar.TypeChar, tar.TypeFifo:
                // Handle this in an OS-specific way
                if err := handleTarTypeBlockCharFifo(hdr, path); err != nil {
                        return err
                }

        case tar.TypeLink:
                targetPath, err := hardlinkRootPath(extractDir, hdr.Linkname)
                if err != nil {
                        return err
                }

                if err := link(targetPath, path); err != nil {
                        return err
                }

        case tar.TypeSymlink:
                if err := os.Symlink(hdr.Linkname, path); err != nil {
                        return err
                }

        case tar.TypeXGlobalHeader:
                log.G(ctx).Debug("PAX Global Extended Headers found and ignored")
                return nil

        default:
                return fmt.Errorf("unhandled tar header type %d", hdr.Typeflag)
        }

        if !noSameOwner {
                if err := os.Lchown(path, hdr.Uid, hdr.Gid); err != nil {
                        err = fmt.Errorf("failed to Lchown %q for UID %d, GID %d: %w", path, hdr.Uid, hdr.Gid, err)
                        if errors.Is(err, syscall.EINVAL) && userns.RunningInUserNS() {
                                err = fmt.Errorf("%w (Hint: try increasing the number of subordinate IDs in /etc/subuid and /etc/subgid)", err)
                        }
                        return err
                }
        }

        for key, value := range hdr.PAXRecords {
                if strings.HasPrefix(key, paxSchilyXattr) {
                        key = key[len(paxSchilyXattr):]
                        if err := setxattr(path, key, value); err != nil {
                                if errors.Is(err, syscall.EPERM) && strings.HasPrefix(key, userXattrPrefix) {
                                        // In the user.* namespace, only regular files and directories can have extended attributes.
                                        // See https://man7.org/linux/man-pages/man7/xattr.7.html for details.
                                        //
                                        // The Lstat error gets its own name so that the
                                        // warning below reports the setxattr error rather
                                        // than a nil stat error.
                                        if fi, statErr := os.Lstat(path); statErr == nil && (!fi.Mode().IsRegular() && !fi.Mode().IsDir()) {
                                                log.G(ctx).WithError(err).Warnf("ignored xattr %s in archive", key)
                                                continue
                                        }
                                }
                                if errors.Is(err, syscall.ENOTSUP) {
                                        log.G(ctx).WithError(err).Warnf("ignored xattr %s in archive", key)
                                        continue
                                }
                                return fmt.Errorf("failed to setxattr %q for key %q: %w", path, key, err)
                        }
                }
        }

        // call lchmod after lchown since lchown can modify the file mode
        if err := lchmod(path, hdrInfo.Mode()); err != nil {
                return err
        }

        return chtimes(path, boundTime(latestTime(hdr.AccessTime, hdr.ModTime)), boundTime(hdr.ModTime))
}

// mkparent makes sure path exists as a directory, creating it and any
// missing ancestors below root with mode 0755.
//
// If path already exists as a directory nothing is done; if it exists as
// something else a *os.PathError with ENOTDIR is returned. After creating
// path, the entries of parents are searched in order for the same relative
// path: the first one found as a directory is passed to copyDirInfo and
// copyUpXAttrs for the new directory; the search stops at the first one
// found as a non-directory.
func mkparent(ctx context.Context, path, root string, parents []string) error {
        if dir, err := os.Lstat(path); err == nil {
                if dir.IsDir() {
                        return nil
                }
                return &os.PathError{
                        Op:   "mkparent",
                        Path: path,
                        Err:  syscall.ENOTDIR,
                }
        } else if !os.IsNotExist(err) {
                return err
        }

        // Scan backwards to the start of the last path element, never going
        // past the end of root.
        i := len(path)
        for i > len(root) && !os.IsPathSeparator(path[i-1]) {
                i--
        }

        // Recurse on the parent directory when one lies strictly below root.
        if i > len(root)+1 {
                if err := mkparent(ctx, path[:i-1], root, parents); err != nil {
                        return err
                }
        }

        if err := mkdir(path, 0755); err != nil {
                // Check that still doesn't exist
                dir, err1 := os.Lstat(path)
                if err1 == nil && dir.IsDir() {
                        return nil
                }
                return err
        }

        for _, p := range parents {
                // Resolve the path relative to root inside this parent.
                ppath, err := fs.RootPath(p, path[len(root):])
                if err != nil {
                        return err
                }

                dir, err := os.Lstat(ppath)
                if err == nil {
                        if !dir.IsDir() {
                                // Replaced, do not copy attributes
                                break
                        }
                        if err := copyDirInfo(dir, path); err != nil {
                                return err
                        }
                        return copyUpXAttrs(path, ppath)
                } else if !os.IsNotExist(err) {
                        return err
                }
        }

        log.G(ctx).Debugf("parent directory %q not found: default permissions(0755) used", path)

        return nil
}

// ChangeWriter provides a tar stream from filesystem change information.
// The provided tar stream is styled as an OCI layer. Change information
// (add/modify/delete/unmodified) for each file needs to be passed to this
// writer through the HandleChange method.
//
// This should be used in combination with continuity's diff computing functionality
// (e.g. `fs.Change` of github.com/containerd/continuity/fs).
//
// See also https://github.com/opencontainers/image-spec/blob/main/layer.md for details
// about OCI layers
type ChangeWriter struct {
        tw                *tar.Writer // tar writer wrapping the destination writer
        source            string      // directory that change paths are joined onto
        modTimeUpperBound *time.Time  // set by WithModTimeUpperBound; nil when unset
        inodeSrc          map[uint64]string
        inodeRefs         map[uint64][]string
        addedDirs         map[string]struct{}
}

// ChangeWriterOpt is an option that can be passed to NewChangeWriter; it is
// called with the ChangeWriter under construction.
type ChangeWriterOpt func(cw *ChangeWriter)

// WithModTimeUpperBound returns an option that stores tm as the mod time
// upper bound of the ChangeWriter.
func WithModTimeUpperBound(tm time.Time) ChangeWriterOpt {
        bound := &tm
        return func(cw *ChangeWriter) {
                cw.modTimeUpperBound = bound
        }
}

// NewChangeWriter creates a ChangeWriter that writes a tar stream for the
// source directory to the provided writer, applying opts in order. Change
// information (add/modify/delete/unmodified) for each file needs to be
// passed through the HandleChange method.
func NewChangeWriter(w io.Writer, source string, opts ...ChangeWriterOpt) *ChangeWriter {
        cw := new(ChangeWriter)
        cw.tw = tar.NewWriter(w)
        cw.source = source
        cw.inodeSrc = make(map[uint64]string)
        cw.inodeRefs = make(map[uint64][]string)
        cw.addedDirs = make(map[string]struct{})

        for i := range opts {
                opts[i](cw)
        }
        return cw
}

// HandleChange receives filesystem change information and reflects that information to
// the result tar stream. This function implements `fs.ChangeFunc` of continuity
// (github.com/containerd/continuity/fs) and should be used with that package.
//
// Parameters:
//   - k is the kind of change reported for p.
//   - p is the changed path; it is joined onto the writer's source directory
//     to locate the file on disk.
//   - f is the file info for p; it is only consulted for non-delete changes.
//   - err is an error reported by the caller and is returned unchanged when
//     non-nil.
//
// Deletions are written as zero-size whiteout entries. Other changes are
// written as regular tar entries (file bodies are copied for non-empty regular
// files); unmodified paths are skipped unless they are needed as hardlink
// references.
func (cw *ChangeWriter) HandleChange(k fs.ChangeKind, p string, f os.FileInfo, err error) error {
        if err != nil {
                return err
        }
        if k == fs.ChangeKindDelete {
                // A deletion is represented by an empty regular file whose base
                // name is the deleted name prefixed with whiteoutPrefix.
                whiteOutDir := filepath.Dir(p)
                whiteOutBase := filepath.Base(p)
                whiteOut := filepath.Join(whiteOutDir, whiteoutPrefix+whiteOutBase)
                // Since containerd v2.0, the whiteout timestamps are set to zero (1970-01-01),
                // not to the source date epoch.
                whiteOutT := time.Unix(0, 0).UTC()
                hdr := &tar.Header{
                        Typeflag:   tar.TypeReg,
                        // Drops the first byte of the joined path.
                        // NOTE(review): presumably p always starts with a path
                        // separator here — confirm against callers.
                        Name:       whiteOut[1:],
                        Size:       0,
                        ModTime:    whiteOutT,
                        AccessTime: whiteOutT,
                        ChangeTime: whiteOutT,
                }
                // Parent directories must appear in the stream before the entry.
                if err := cw.includeParents(hdr); err != nil {
                        return err
                }
                if err := cw.tw.WriteHeader(hdr); err != nil {
                        return fmt.Errorf("failed to write whiteout header: %w", err)
                }
        } else {
                var (
                        link   string
                        err    error
                        source = filepath.Join(cw.source, p)
                )

                switch {
                case f.Mode()&os.ModeSocket != 0:
                        return nil // ignore sockets
                case f.Mode()&os.ModeSymlink != 0:
                        // The link target is needed to build the tar header.
                        if link, err = os.Readlink(source); err != nil {
                                return err
                        }
                }

                // Use FileInfoHeaderNoLookups to avoid propagating user names and group names from the host
                hdr, err := tarheader.FileInfoHeaderNoLookups(f, link)
                if err != nil {
                        return err
                }

                hdr.Mode = int64(chmodTarEntry(os.FileMode(hdr.Mode)))

                // truncate timestamp for compatibility. without PAX stdlib rounds timestamps instead
                hdr.Format = tar.FormatPAX
                // Clamp the modification time to the configured upper bound, if any.
                if cw.modTimeUpperBound != nil && hdr.ModTime.After(*cw.modTimeUpperBound) {
                        hdr.ModTime = *cw.modTimeUpperBound
                }
                hdr.ModTime = hdr.ModTime.Truncate(time.Second)
                // Access and change times are explicitly zeroed in the header.
                hdr.AccessTime = time.Time{}
                hdr.ChangeTime = time.Time{}

                // Entry names are relative to the root: strip a leading separator.
                name := p
                if strings.HasPrefix(name, string(filepath.Separator)) {
                        name, err = filepath.Rel(string(filepath.Separator), name)
                        if err != nil {
                                return fmt.Errorf("failed to make path relative: %w", err)
                        }
                }
                // Canonicalize to POSIX-style paths using forward slashes. Directory
                // entries must end with a slash.
                name = filepath.ToSlash(name)
                if f.IsDir() && !strings.HasSuffix(name, "/") {
                        name += "/"
                }
                hdr.Name = name

                if err := setHeaderForSpecialDevice(hdr, name, f); err != nil {
                        return fmt.Errorf("failed to set device headers: %w", err)
                }

                // additionalLinks stores file names which must be linked to
                // this file when this file is added
                var additionalLinks []string
                inode, isHardlink := fs.GetLinkInfo(f)
                if isHardlink {
                        // If the inode has a source, always link to it
                        if source, ok := cw.inodeSrc[inode]; ok {
                                hdr.Typeflag = tar.TypeLink
                                hdr.Linkname = source
                                hdr.Size = 0
                        } else {
                                // No entry has been written for this inode yet. An
                                // unmodified path is only remembered so it can be
                                // emitted as a link once a changed path for the same
                                // inode is written.
                                if k == fs.ChangeKindUnmodified {
                                        cw.inodeRefs[inode] = append(cw.inodeRefs[inode], name)
                                        return nil
                                }
                                // This entry becomes the link source for the inode;
                                // previously remembered names are linked to it below.
                                cw.inodeSrc[inode] = name
                                additionalLinks = cw.inodeRefs[inode]
                                delete(cw.inodeRefs, inode)
                        }
                } else if k == fs.ChangeKindUnmodified {
                        // Nothing to write to diff
                        return nil
                }

                // Carry the security.capability xattr over as a PAX record when it is set.
                if capability, err := getxattr(source, "security.capability"); err != nil {
                        return fmt.Errorf("failed to get capabilities xattr: %w", err)
                } else if len(capability) > 0 {
                        if hdr.PAXRecords == nil {
                                hdr.PAXRecords = map[string]string{}
                        }
                        hdr.PAXRecords[paxSchilyXattr+"security.capability"] = string(capability)
                }

                // Parent directories must appear in the stream before the entry.
                if err := cw.includeParents(hdr); err != nil {
                        return err
                }
                if err := cw.tw.WriteHeader(hdr); err != nil {
                        return fmt.Errorf("failed to write file header: %w", err)
                }

                // Only non-empty regular files carry a body in the tar stream.
                if hdr.Typeflag == tar.TypeReg && hdr.Size > 0 {
                        file, err := open(source)
                        if err != nil {
                                return fmt.Errorf("failed to open path: %v: %w", source, err)
                        }
                        // The file stays open until HandleChange returns.
                        defer file.Close()

                        n, err := copyBuffered(context.TODO(), cw.tw, file)
                        if err != nil {
                                return fmt.Errorf("failed to copy: %w", err)
                        }
                        // The number of copied bytes must match the size announced
                        // in the header.
                        if n != hdr.Size {
                                return errors.New("short write copying file")
                        }
                }

                // Emit the hardlinks that were deferred until their source existed.
                // The same header is reused with the name, type, link target and
                // size overwritten for each link.
                if additionalLinks != nil {
                        source = hdr.Name
                        for _, extra := range additionalLinks {
                                hdr.Name = extra
                                hdr.Typeflag = tar.TypeLink
                                hdr.Linkname = source
                                hdr.Size = 0

                                if err := cw.includeParents(hdr); err != nil {
                                        return err
                                }
                                if err := cw.tw.WriteHeader(hdr); err != nil {
                                        return fmt.Errorf("failed to write file header: %w", err)
                                }
                        }
                }
        }
        return nil
}

// Close closes the underlying tar writer. Any error from the tar writer is
// wrapped and returned.
func (cw *ChangeWriter) Close() error {
        err := cw.tw.Close()
        if err == nil {
                return nil
        }
        return fmt.Errorf("failed to close tar writer: %w", err)
}

// includeParents makes sure the parent directory of hdr's entry has been sent
// through HandleChange before the entry itself is written. Parents that
// resolve to the source root are never emitted. Directory entries are
// recorded in addedDirs so they are not emitted again as a parent later.
//
// It is a no-op when addedDirs is nil.
func (cw *ChangeWriter) includeParents(hdr *tar.Header) error {
        if cw.addedDirs == nil {
                return nil
        }

        var (
                entry      = strings.TrimRight(hdr.Name, "/")
                parentDir  = filepath.Dir(entry)
                entryPath  = filepath.Join(cw.source, entry)
                parentPath = filepath.Join(cw.source, parentDir)
        )

        // Do not include root directory as parent
        atRoot := entryPath == cw.source || parentPath == cw.source
        if _, seen := cw.addedDirs[parentDir]; !atRoot && !seen {
                // Mark the parent before recursing so it is only handled once.
                cw.addedDirs[parentDir] = struct{}{}
                info, err := os.Stat(parentPath)
                if err != nil {
                        return err
                }
                if err := cw.HandleChange(fs.ChangeKindModify, parentDir, info, nil); err != nil {
                        return err
                }
        }

        if hdr.Typeflag == tar.TypeDir {
                cw.addedDirs[entry] = struct{}{}
        }
        return nil
}

// copyBuffered copies from src to dst through a buffer taken from bufPool
// until src reports io.EOF. The context is checked before every read; when it
// is done, the copy stops and the context's error is returned.
//
// It returns the number of bytes written to dst together with the first
// write error, short-write condition, or non-EOF read error encountered.
func copyBuffered(ctx context.Context, dst io.Writer, src io.Reader) (written int64, err error) {
        buf := bufPool.Get().(*[]byte)
        defer bufPool.Put(buf)

        for {
                select {
                case <-ctx.Done():
                        return written, ctx.Err()
                default:
                }

                nr, readErr := src.Read(*buf)
                if nr > 0 {
                        nw, writeErr := dst.Write((*buf)[:nr])
                        if nw > 0 {
                                written += int64(nw)
                        }
                        if writeErr != nil {
                                return written, writeErr
                        }
                        if nw != nr {
                                return written, io.ErrShortWrite
                        }
                }
                if readErr == io.EOF {
                        // End of input is the normal way to finish.
                        return written, nil
                }
                if readErr != nil {
                        return written, readErr
                }
        }
}

// hardlinkRootPath returns target linkname, evaluating and bounding any
// symlink to the parent directory.
//
// Only the directory part of linkname is resolved through fs.RootPath; the
// final path element is joined back on unresolved. If the resulting path does
// not start with root, root itself is returned.
//
// NOTE: Allow hardlink to the softlink, not the real one. For example,
//
//        touch /tmp/zzz
//        ln -s /tmp/zzz /tmp/xxx
//        ln /tmp/xxx /tmp/yyy
//
// /tmp/yyy should be softlink which be same of /tmp/xxx, not /tmp/zzz.
func hardlinkRootPath(root, linkname string) (string, error) {
        dir, file := filepath.Split(linkname)
        resolvedDir, err := fs.RootPath(root, dir)
        if err != nil {
                return "", err
        }

        if target := filepath.Join(resolvedDir, file); strings.HasPrefix(target, root) {
                return target, nil
        }
        return root, nil
}

// validateWhiteout checks that a whiteout entry at path refers to a file
// inside its own directory. Opaque-directory markers and names without the
// whiteout prefix are always accepted. For a prefixed name, the path of the
// file being whited out must stay under the entry's directory; otherwise an
// error wrapping errInvalidArchive is returned.
func validateWhiteout(path string) error {
        dir, base := filepath.Dir(path), filepath.Base(path)

        if base == whiteoutOpaqueDir || !strings.HasPrefix(base, whiteoutPrefix) {
                return nil
        }

        target := filepath.Join(dir, strings.TrimPrefix(base, whiteoutPrefix))

        // Ensure the whited-out path is under dir
        sep := string(filepath.Separator)
        if !strings.HasSuffix(dir, sep) {
                dir += sep
        }
        if strings.HasPrefix(target, dir) {
                return nil
        }
        return fmt.Errorf("invalid whiteout name: %v: %w", base, errInvalidArchive)
}

//go:build !windows && !freebsd

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import (
        "os"

        "golang.org/x/sys/unix"
)

// mknod wraps unix.Mknod, converting dev from uint64 to the int that
// unix.Mknod takes on the platforms this file is built for.
func mknod(path string, mode uint32, dev uint64) error {
        return unix.Mknod(path, mode, int(dev))
}

// lsetxattrCreate wraps unix.Lsetxattr, passing the unix.XATTR_CREATE flag.
// ENOTSUP, ENODATA and EEXIST results are treated as success; any other
// error is returned unchanged.
func lsetxattrCreate(link string, attr string, data []byte) error {
        err := unix.Lsetxattr(link, attr, data, unix.XATTR_CREATE)
        for _, ignored := range []error{unix.ENOTSUP, unix.ENODATA, unix.EEXIST} {
                if err == ignored {
                        return nil
                }
        }
        return err
}

// lchmod changes the mode of path unless path itself is a symbolic link, in
// which case nothing is changed. Errors from os.Lstat and os.Chmod are
// returned as-is.
func lchmod(path string, mode os.FileMode) error {
        info, err := os.Lstat(path)
        if err != nil {
                return err
        }
        if info.Mode()&os.ModeSymlink != 0 {
                // Leave symlinks (and their targets) untouched.
                return nil
        }
        return os.Chmod(path, mode)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import (
        "archive/tar"
        "context"
        "io"
        "time"
)

// ApplyOptions provides additional options for an Apply operation.
// Values are normally populated through ApplyOpt functions such as
// WithFilter, WithConvertWhiteout, WithParents and WithNoSameOwner.
type ApplyOptions struct {
        Filter          Filter          // Filter tar headers
        ConvertWhiteout ConvertWhiteout // Convert whiteout files
        Parents         []string        // Parent directories to handle inherited attributes without CoW
        NoSameOwner     bool            // NoSameOwner will not attempt to preserve the owner specified in the tar archive.

        // applyFunc is an unexported hook with the signature of an apply
        // implementation. NOTE(review): how it is selected and invoked is not
        // visible in this part of the file.
        applyFunc func(context.Context, string, io.Reader, ApplyOptions) (int64, error)
}

// ApplyOpt allows setting mutable archive apply properties on creation.
// An ApplyOpt mutates the given ApplyOptions and may return an error.
type ApplyOpt func(options *ApplyOptions) error

// Filter selects specific files from the archive. It is called with a tar
// header; the permissive filter in this package returns true to let an entry
// through.
type Filter func(*tar.Header) (bool, error)

// ConvertWhiteout converts whiteout files from the archive. It receives the
// tar header and a path. OverlayConvertWhiteout returns false when the entry
// itself must not be written and true otherwise.
type ConvertWhiteout func(*tar.Header, string) (bool, error)

// all is a Filter that allows all files: it ignores the header and always
// returns true with a nil error.
func all(_ *tar.Header) (bool, error) {
        return true, nil
}

// WithFilter returns an ApplyOpt that installs f as the filter deciding
// which files are to be extracted.
func WithFilter(f Filter) ApplyOpt {
        return func(o *ApplyOptions) error {
                o.Filter = f
                return nil
        }
}

// WithConvertWhiteout returns an ApplyOpt that installs c as the function
// used to convert whiteout files.
func WithConvertWhiteout(c ConvertWhiteout) ApplyOpt {
        return func(o *ApplyOptions) error {
                o.ConvertWhiteout = c
                return nil
        }
}

// WithNoSameOwner is the same as `--no-same-owner` in the `tar` command.
// It returns an ApplyOpt that sets NoSameOwner, so no attempt is made to
// preserve the owner specified in the tar archive.
func WithNoSameOwner() ApplyOpt {
        return func(o *ApplyOptions) error {
                o.NoSameOwner = true
                return nil
        }
}

// WithParents provides the parent directories used to resolve inherited
// attributes directly from the filesystem.
// Inherited attributes are searched from first to last, making the first
// element in the list the most immediate parent directory.
// NOTE: When applying to a filesystem which supports CoW, file attributes
// should be inherited by the filesystem.
func WithParents(p []string) ApplyOpt {
        return func(o *ApplyOptions) error {
                o.Parents = p
                return nil
        }
}

// WriteDiffOptions provides additional options for a WriteDiff operation.
// Values are normally populated through WriteDiffOpt functions such as
// WithSourceDateEpoch.
type WriteDiffOptions struct {
        ParentLayers []string // Windows needs the full list of parent layers

        // writeDiffFunc is an unexported hook with the signature of a
        // write-diff implementation. NOTE(review): how it is selected and
        // invoked is not visible in this part of the file.
        writeDiffFunc func(context.Context, io.Writer, string, string, WriteDiffOptions) error

        // SourceDateEpoch specifies the following timestamps to provide control for reproducibility.
        //   - The upper bound timestamp of the diff contents
        //   - The timestamp of the whiteouts
        //
        // NOTE(review): ChangeWriter.HandleChange writes whiteouts with a zero
        // (1970-01-01) timestamp; confirm whether the whiteout bullet above is
        // still accurate.
        //
        // See also https://reproducible-builds.org/docs/source-date-epoch/ .
        SourceDateEpoch *time.Time
}

// WriteDiffOpt allows setting mutable archive write properties on creation.
// A WriteDiffOpt mutates the given WriteDiffOptions and may return an error.
type WriteDiffOpt func(options *WriteDiffOptions) error

// WithSourceDateEpoch returns a WriteDiffOpt that stores tm as the
// SOURCE_DATE_EPOCH for the operation without touching the env vars.
func WithSourceDateEpoch(tm *time.Time) WriteDiffOpt {
        return func(o *WriteDiffOptions) error {
                o.SourceDateEpoch = tm
                return nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import (
        "archive/tar"
        "os"
        "path/filepath"
        "strings"

        "golang.org/x/sys/unix"
)

// OverlayConvertWhiteout converts whiteout files for overlay.
//
// An opaque-directory marker is translated into the "trusted.overlay.opaque"
// xattr (value "y") on the containing directory. A whiteout for a deleted
// file is translated into a character device with device number 0 at the
// original path, owned by the uid/gid from hdr. In both cases the returned
// bool is false so that the whiteout file itself is not written. Any other
// entry yields true with a nil error.
func OverlayConvertWhiteout(hdr *tar.Header, path string) (bool, error) {
        dir, base := filepath.Dir(path), filepath.Base(path)

        switch {
        case base == whiteoutOpaqueDir:
                // Opaque directory: mark the directory, don't write the marker file.
                return false, unix.Setxattr(dir, "trusted.overlay.opaque", []byte{'y'}, 0)

        case strings.HasPrefix(base, whiteoutPrefix):
                // Deleted file: overlay expects a character device in its place.
                target := filepath.Join(dir, strings.TrimPrefix(base, whiteoutPrefix))
                if err := unix.Mknod(target, unix.S_IFCHR, 0); err != nil {
                        return false, err
                }
                // don't write the file itself
                return false, os.Chown(target, hdr.Uid, hdr.Gid)

        default:
                return true, nil
        }
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import (
        "archive/tar"
        "errors"
        "fmt"
        "os"
        "runtime"
        "strings"
        "syscall"

        "github.com/containerd/containerd/v2/pkg/userns"
        "github.com/containerd/continuity/fs"
        "github.com/containerd/continuity/sysx"
        "golang.org/x/sys/unix"
)

// chmodTarEntry returns the file mode to record in a tar entry. In this
// (non-Windows) build the mode is returned unchanged.
func chmodTarEntry(perm os.FileMode) os.FileMode {
        return perm
}

// setHeaderForSpecialDevice fills hdr.Devmajor and hdr.Devminor from the
// stat data of fi when fi describes a block or character device. Other file
// types leave hdr untouched.
//
// The name parameter is not used by this implementation. An error is
// returned when fi.Sys() is not a *syscall.Stat_t.
func setHeaderForSpecialDevice(hdr *tar.Header, name string, fi os.FileInfo) error {
        // Devmajor and Devminor are only needed for special devices.

        // In FreeBSD, RDev for regular files is -1 (unless overridden by FS):
        // https://cgit.freebsd.org/src/tree/sys/kern/vfs_default.c?h=stable/13#n1531
        // (NODEV is -1: https://cgit.freebsd.org/src/tree/sys/sys/param.h?h=stable/13#n241).

        // ZFS in particular does not override the default:
        // https://cgit.freebsd.org/src/tree/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c?h=stable/13#n2027

        // Since `Stat_t.Rdev` is uint64, the cast turns -1 into (2^64 - 1).
        // Such large values cannot be encoded in a tar header.
        if runtime.GOOS == "freebsd" && hdr.Typeflag != tar.TypeBlock && hdr.Typeflag != tar.TypeChar {
                return nil
        }
        s, ok := fi.Sys().(*syscall.Stat_t)
        if !ok {
                return errors.New("unsupported stat type")
        }

        // Currently go does not fill in the major/minors.
        //
        // The file type must be extracted with S_IFMT before comparing: the
        // S_IFBLK (0o060000) and S_IFCHR (0o020000) patterns share bits with
        // S_IFDIR, S_IFLNK and S_IFSOCK, so a plain "mode & S_IFBLK != 0" test
        // also matches directories, symlinks and sockets.
        switch s.Mode & syscall.S_IFMT {
        case syscall.S_IFBLK, syscall.S_IFCHR:
                rdev := uint64(s.Rdev) //nolint:nolintlint,unconvert // rdev is int32 on darwin/bsd, int64 on linux/solaris
                hdr.Devmajor = int64(unix.Major(rdev))
                hdr.Devminor = int64(unix.Minor(rdev))
        }

        return nil
}

// open opens the file at p for reading. In this (non-Windows) build it is a
// direct call to os.Open.
func open(p string) (*os.File, error) {
        return os.Open(p)
}

// openFile opens name with the given flags and then explicitly chmods it to
// perm, so that the resulting permissions are not reduced by the process
// permission mask. If the chmod fails, the file is closed and the error is
// returned.
func openFile(name string, flag int, perm os.FileMode) (*os.File, error) {
        file, openErr := os.OpenFile(name, flag, perm)
        if openErr != nil {
                return nil, openErr
        }

        // Call chmod to avoid permission mask
        if chmodErr := os.Chmod(name, perm); chmodErr != nil {
                file.Close()
                return nil, chmodErr
        }
        return file, nil
}

// mkdir creates the directory at path and then explicitly chmods it to perm,
// so that the resulting permissions are not reduced by the process
// permission mask. Only the final created directory gets this explicit
// permission call.
func mkdir(path string, perm os.FileMode) error {
        err := os.Mkdir(path, perm)
        if err == nil {
                err = os.Chmod(path, perm)
        }
        return err
}

// skipFile reports whether the entry described by hdr must be skipped.
// Block and character devices are skipped when running inside a user
// namespace, because they cannot be created there; nothing else is skipped.
func skipFile(hdr *tar.Header) bool {
        isDevice := hdr.Typeflag == tar.TypeBlock || hdr.Typeflag == tar.TypeChar
        if !isDevice {
                return false
        }
        // cannot create a device if running in user namespace
        return userns.RunningInUserNS()
}

// handleTarTypeBlockCharFifo is an OS-specific helper function used by
// createTarFile to handle the following types of header: Block; Char; Fifo.
// It creates the node at path via mknod, using the permission bits from
// hdr.Mode, the file-type bit matching hdr.Typeflag, and the device number
// built from hdr.Devmajor and hdr.Devminor.
//
// This function must not be called for Block and Char when running in userns.
// (skipFile() should return true for them.)
func handleTarTypeBlockCharFifo(hdr *tar.Header, path string) error {
        var fileType uint32
        switch hdr.Typeflag {
        case tar.TypeBlock:
                fileType = unix.S_IFBLK
        case tar.TypeChar:
                fileType = unix.S_IFCHR
        case tar.TypeFifo:
                fileType = unix.S_IFIFO
        }

        perm := uint32(hdr.Mode & 07777)
        dev := unix.Mkdev(uint32(hdr.Devmajor), uint32(hdr.Devminor))
        return mknod(path, perm|fileType, dev)
}

// getxattr reads the extended attribute attr of path without following a
// trailing symlink. ENOTSUP and ENODATA results are reported as
// (nil, nil); any other outcome is passed through unchanged.
func getxattr(path, attr string) ([]byte, error) {
        value, err := sysx.LGetxattr(path, attr)
        if err != unix.ENOTSUP && err != sysx.ENODATA {
                return value, err
        }
        return nil, nil
}

// setxattr sets the extended attribute key of path to value without
// following a trailing symlink. Attributes in the "trusted." namespace are
// refused with an error wrapping unix.ENOTSUP.
func setxattr(path, key, value string) error {
        if !strings.HasPrefix(key, "trusted.") {
                return unix.Lsetxattr(path, key, []byte(value), 0)
        }
        // Do not set trusted attributes
        return fmt.Errorf("admin attributes from archive not supported: %w", unix.ENOTSUP)
}

// copyDirInfo applies the ownership, mode, and access/modification times
// described by fi to path. Ownership and times are set without following a
// trailing symlink.
//
// A permission error from chown is tolerated when path already has the
// desired uid and gid. fi.Sys() must be a *syscall.Stat_t.
func copyDirInfo(fi os.FileInfo, path string) error {
        st := fi.Sys().(*syscall.Stat_t)

        if chownErr := os.Lchown(path, int(st.Uid), int(st.Gid)); chownErr != nil {
                // Normally if uid/gid are the same this would be a no-op, but some
                // filesystems may still return EPERM... for instance NFS does this.
                // In such a case, this is not an error.
                alreadyOwned := false
                if os.IsPermission(chownErr) {
                        if dstStat, statErr := os.Lstat(path); statErr == nil {
                                cur := dstStat.Sys().(*syscall.Stat_t)
                                alreadyOwned = st.Uid == cur.Uid && st.Gid == cur.Gid
                        }
                }
                if !alreadyOwned {
                        return fmt.Errorf("failed to chown %s: %w", path, chownErr)
                }
        }

        if err := os.Chmod(path, fi.Mode()); err != nil {
                return fmt.Errorf("failed to chmod %s: %w", path, err)
        }

        atime := unix.NsecToTimespec(syscall.TimespecToNsec(fs.StatAtime(st)))
        mtime := unix.NsecToTimespec(syscall.TimespecToNsec(fs.StatMtime(st)))
        times := []unix.Timespec{atime, mtime}
        if err := unix.UtimesNanoAt(unix.AT_FDCWD, path, times, unix.AT_SYMLINK_NOFOLLOW); err != nil {
                return fmt.Errorf("failed to utime %s: %w", path, err)
        }

        return nil
}

func copyUpXAttrs(dst, src string) error {
        xattrKeys, err := sysx.LListxattr(src)
        if err != nil {
                if err == unix.ENOTSUP || err == sysx.ENODATA {
                        return nil
                }
                return fmt.Errorf("failed to list xattrs on %s: %w", src, err)
        }
        for _, xattr := range xattrKeys {
                // Do not copy up trusted attributes
                if strings.HasPrefix(xattr, "trusted.") {
                        continue
                }
                data, err := sysx.LGetxattr(src, xattr)
                if err != nil {
                        if err == unix.ENOTSUP || err == sysx.ENODATA {
                                continue
                        }
                        return fmt.Errorf("failed to get xattr %q on %s: %w", xattr, src, err)
                }
                if err := lsetxattrCreate(dst, xattr, data); err != nil {
                        return fmt.Errorf("failed to set xattr %q on %s: %w", xattr, dst, err)
                }
        }

        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

/*
   Portions from https://github.com/moby/moby/blob/v23.0.1/pkg/archive/archive.go#L419-L464
   Copyright (C) Docker/Moby authors.
   Licensed under the Apache License, Version 2.0
   NOTICE: https://github.com/moby/moby/blob/v23.0.1/NOTICE
*/

package tarheader

import (
        "archive/tar"
        "os"
)

// nosysFileInfo wraps an os.FileInfo and hides its system-dependent data so
// that tar.FileInfoHeader cannot introspect it and potentially call into
// glibc.
//
// From https://github.com/moby/moby/blob/v23.0.1/pkg/archive/archive.go#L419-L434 .
type nosysFileInfo struct {
        os.FileInfo
}

// Sys returns the wrapped FileInfo's Sys value only when it is a *tar.Header,
// and nil for everything else.
func (fi nosysFileInfo) Sys() interface{} {
        hdr, isTarHeader := fi.FileInfo.Sys().(*tar.Header)
        if !isTarHeader {
                return nil
        }
        // A Sys value of type *tar.Header is safe as it is system-independent.
        // The tar.FileInfoHeader function copies the fields into the returned
        // header without performing any OS lookups.
        return hdr
}

// sysStat, if non-nil, populates hdr from system-dependent fields of fi.
// FileInfoHeaderNoLookups calls it after building the base header and
// returns its error.
//
// From https://github.com/moby/moby/blob/v23.0.1/pkg/archive/archive.go#L436-L437 .
var sysStat func(fi os.FileInfo, hdr *tar.Header) error

// FileInfoHeaderNoLookups creates a partially-populated tar.Header from fi.
//
// Compared to the archive/tar.FileInfoHeader function, this function is safe to
// call from a chrooted process as it does not populate fields which would
// require operating system lookups. It behaves identically to
// tar.FileInfoHeader when fi is a FileInfo value returned from
// tar.Header.FileInfo().
//
// When fi is a FileInfo for a native file, such as returned from os.Stat() and
// os.Lstat(), the returned Header value differs from one returned from
// tar.FileInfoHeader in the following ways. The Uname and Gname fields are not
// set as OS lookups would be required to populate them. The AccessTime and
// ChangeTime fields are not currently set (not yet implemented) although that
// is subject to change. Callers which require the AccessTime or ChangeTime
// fields to be zeroed should explicitly zero them out in the returned Header
// value to avoid any compatibility issues in the future.
//
// When the platform hook sysStat is set, it is invoked on the header and its
// error is returned together with the header.
//
// From https://github.com/moby/moby/blob/v23.0.1/pkg/archive/archive.go#L439-L464 .
func FileInfoHeaderNoLookups(fi os.FileInfo, link string) (*tar.Header, error) {
        hdr, err := tar.FileInfoHeader(nosysFileInfo{fi}, link)
        switch {
        case err != nil:
                return nil, err
        case sysStat == nil:
                return hdr, nil
        }
        return hdr, sysStat(fi, hdr)
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

/*
   Portions from https://github.com/moby/moby/blob/v23.0.1/pkg/archive/archive_unix.go#L52-L70
   Copyright (C) Docker/Moby authors.
   Licensed under the Apache License, Version 2.0
   NOTICE: https://github.com/moby/moby/blob/v23.0.1/NOTICE
*/

package tarheader

import (
        "archive/tar"
        "os"
        "runtime"
        "syscall"

        "golang.org/x/sys/unix"
)

// init installs statUnix as the sysStat hook for this (non-Windows) build.
func init() {
        sysStat = statUnix
}

// statUnix populates hdr from system-dependent fields of fi without performing
// any OS lookups. It copies the numeric uid and gid, and for block and
// character devices also the device major and minor numbers. It does nothing
// when fi.Sys() is not a *syscall.Stat_t, and always returns nil.
// From https://github.com/moby/moby/blob/2cccb1f02c83aaacef3c15fa43f3b64d57f315f8/pkg/archive/archive_unix.go#L46-L76
func statUnix(fi os.FileInfo, hdr *tar.Header) error {
        s, ok := fi.Sys().(*syscall.Stat_t)
        if !ok {
                return nil
        }

        hdr.Uid = int(s.Uid)
        hdr.Gid = int(s.Gid)

        // Devmajor and Devminor are only needed for special devices.

        // In FreeBSD, RDev for regular files is -1 (unless overridden by FS):
        // https://cgit.freebsd.org/src/tree/sys/kern/vfs_default.c?h=stable/13#n1531
        // (NODEV is -1: https://cgit.freebsd.org/src/tree/sys/sys/param.h?h=stable/13#n241).

        // ZFS in particular does not override the default:
        // https://cgit.freebsd.org/src/tree/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c?h=stable/13#n2027

        // Since `Stat_t.Rdev` is uint64, the cast turns -1 into (2^64 - 1).
        // Such large values cannot be encoded in a tar header.
        if runtime.GOOS == "freebsd" && hdr.Typeflag != tar.TypeBlock && hdr.Typeflag != tar.TypeChar {
                return nil
        }

        // The file type must be extracted with S_IFMT before comparing: the
        // S_IFBLK (0o060000) and S_IFCHR (0o020000) patterns share bits with
        // S_IFDIR, S_IFLNK and S_IFSOCK, so a plain "mode & S_IFBLK != 0" test
        // also matches directories, symlinks and sockets.
        switch s.Mode & unix.S_IFMT {
        case unix.S_IFBLK, unix.S_IFCHR:
                hdr.Devmajor = int64(unix.Major(uint64(s.Rdev)))
                hdr.Devminor = int64(unix.Minor(uint64(s.Rdev)))
        }

        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import (
        "syscall"
        "time"
        "unsafe"
)

// minTime and maxTime are the inclusive limits boundTime accepts.
var (
        minTime = time.Unix(0, 0)
        maxTime time.Time
)

// init picks maxTime according to the width of syscall.Timespec's Nsec field.
func init() {
        switch unsafe.Sizeof(syscall.Timespec{}.Nsec) {
        case 8:
                // 64 bit timespec: os.Chtimes limits time to the largest
                // nanosecond count an int64 can hold.
                maxTime = time.Unix(0, 1<<63-1)
        default:
                // 32 bit timespec: the largest representable second count.
                maxTime = time.Unix(1<<31-1, 0)
        }
}

// boundTime returns t unchanged when it lies within [minTime, maxTime];
// any value outside that range is replaced by minTime.
func boundTime(t time.Time) time.Time {
        inRange := !t.Before(minTime) && !t.After(maxTime)
        if inRange {
                return t
        }
        return minTime
}

// latestTime returns the later of t1 and t2; t1 is returned when neither is
// later than the other.
func latestTime(t1, t2 time.Time) time.Time {
        if t2.After(t1) {
                return t2
        }
        return t1
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package archive

import (
        "fmt"
        "time"

        "golang.org/x/sys/unix"
)

// chtimes sets the access and modification times of path via
// unix.UtimesNanoAt. AT_SYMLINK_NOFOLLOW is passed, so a symlink itself is
// updated rather than its target.
func chtimes(path string, atime, mtime time.Time) error {
        utimes := [2]unix.Timespec{
                unix.NsecToTimespec(atime.UnixNano()),
                unix.NsecToTimespec(mtime.UnixNano()),
        }

        err := unix.UtimesNanoAt(unix.AT_FDCWD, path, utimes[:], unix.AT_SYMLINK_NOFOLLOW)
        if err == nil {
                return nil
        }
        return fmt.Errorf("failed call to UtimesNanoAt for %s: %w", path, err)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

/*
Package atomicfile provides a mechanism (on Unix-like platforms) to present a consistent view of a file to separate
processes even while the file is being written.  This is accomplished by writing a temporary file, syncing to disk, and
renaming over the destination file name.

Partial/inconsistent reads can occur due to:
 1. A process attempting to read the file while it is being written to (both in the case of a new file with a
    short/incomplete write or in the case of an existing, updated file where new bytes may be written at the beginning
    but old bytes may still be present after).
 2. Concurrent goroutines leading to multiple active writers of the same file.

The above mechanism explicitly protects against (1) as all writes are to a file with a temporary name.

There is no explicit protection against multiple, concurrent goroutines attempting to write the same file. However,
atomically writing the file should mean only one writer will "win" and a consistent file will be visible.

Note: atomicfile is partially implemented for Windows. The Windows codepath performs the same operations, however
Windows does not guarantee that a rename operation is atomic; a crash in the middle may leave the destination file
truncated rather than with the expected content.
*/
package atomicfile

import (
        "errors"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "sync"
)

// File is an io.ReadWriteCloser that can also be Canceled if a change needs to be abandoned.
//
// Close commits the change; Cancel discards it.
type File interface {
        io.ReadWriteCloser
        // Cancel abandons a change to a file. This can be called if a write fails or another error occurs.
        Cancel() error
}

// ErrClosed is returned if Read or Write are called on a File that has
// already been closed or canceled.
var ErrClosed = errors.New("file is closed")

// New returns a new atomic file.  On Unix-like platforms, the writer (an io.ReadWriteCloser) is backed by a temporary
// file placed into the same directory as the destination file (using filepath.Dir to split the directory from the
// name).  On a call to Close the temporary file is synced to disk and renamed to its final name, hiding any previous
// file by the same name.
//
// name is the destination path; mode is applied to the temporary file as soon as it is created.
//
// Note: Take care to call Close and handle any errors that are returned.  Errors returned from Close may indicate that
// the file was not written with its final name.
func New(name string, mode os.FileMode) (File, error) {
        return newFile(name, mode)
}

// atomicFile is the File implementation returned by New.
type atomicFile struct {
        // name is the destination path the temporary file is renamed to on Close.
        name     string
        // f is the temporary file that receives all reads and writes.
        f        *os.File
        // closed is set once Close or Cancel has run; guarded by closedMu.
        closed   bool
        // closedMu is held for reading by Read/Write and for writing by Close/Cancel.
        closedMu sync.RWMutex
}

// newFile creates a temporary file in the directory of name, applies mode to
// it and wraps it in an atomicFile targeting name.
//
// If the permissions cannot be applied, the temporary file is closed and
// removed before the error is returned so that no descriptor or stray file
// is left behind.
func newFile(name string, mode os.FileMode) (File, error) {
        dir := filepath.Dir(name)
        f, err := os.CreateTemp(dir, "")
        if err != nil {
                return nil, fmt.Errorf("failed to create temp file: %w", err)
        }
        if err := f.Chmod(mode); err != nil {
                // Best-effort cleanup; the chmod failure is the error worth reporting.
                _ = f.Close()
                _ = os.Remove(f.Name())
                return nil, fmt.Errorf("failed to change temp file permissions: %w", err)
        }
        return &atomicFile{name: name, f: f}, nil
}

// Close commits the file: the temporary file is synced, closed and renamed to
// its final name. Calling Close (or Cancel) again afterwards is a no-op that
// returns nil.
//
// If any step fails, the temporary file is removed and the error is returned;
// the destination is then left as it was.
func (a *atomicFile) Close() (err error) {
        a.closedMu.Lock()
        defer a.closedMu.Unlock()

        if a.closed {
                return nil
        }
        a.closed = true

        defer func() {
                if err != nil {
                        _ = os.Remove(a.f.Name()) // ignore errors
                }
        }()
        // The order of operations here is:
        // 1. sync
        // 2. close
        // 3. rename
        // While the ordering of 2 and 3 is not important on Unix-like operating systems, Windows cannot rename an open
        // file. By closing first, we allow the rename operation to succeed.
        if err = a.f.Sync(); err != nil {
                // Still release the descriptor: the file is marked closed, so nobody
                // else will, and an open file may block the deferred removal.
                _ = a.f.Close() // ignore error, the sync failure is reported
                return fmt.Errorf("failed to sync temp file %q: %w", a.f.Name(), err)
        }
        if err = a.f.Close(); err != nil {
                return fmt.Errorf("failed to close temp file %q: %w", a.f.Name(), err)
        }
        if err = os.Rename(a.f.Name(), a.name); err != nil {
                return fmt.Errorf("failed to rename %q to %q: %w", a.f.Name(), a.name, err)
        }
        return nil
}

// Cancel abandons the change: the temporary file is closed and removed, and
// the destination is not touched. It is a no-op returning nil once the file
// has been closed or canceled. The returned error is that of the removal.
func (a *atomicFile) Cancel() error {
        a.closedMu.Lock()
        defer a.closedMu.Unlock()

        if !a.closed {
                a.closed = true
                _ = a.f.Close() // ignore error
                tmp := a.f.Name()
                return os.Remove(tmp)
        }
        return nil
}

// Read reads from the temporary file. It returns ErrClosed once the file has
// been closed or canceled.
func (a *atomicFile) Read(p []byte) (n int, err error) {
        a.closedMu.RLock()
        defer a.closedMu.RUnlock()

        if !a.closed {
                return a.f.Read(p)
        }
        return 0, ErrClosed
}

// Write writes to the temporary file. It returns ErrClosed once the file has
// been closed or canceled.
func (a *atomicFile) Write(p []byte) (n int, err error) {
        a.closedMu.RLock()
        defer a.closedMu.RUnlock()

        if !a.closed {
                return a.f.Write(p)
        }
        return 0, ErrClosed
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package blockio

import (
        "fmt"
        "sync"

        "github.com/intel/goresctrl/pkg/blockio"
        runtimespec "github.com/opencontainers/runtime-spec/specs-go"

        "github.com/containerd/log"
)

// enabled records whether a blockio configuration is currently loaded;
// enabledMu guards it.
var (
        enabled   bool
        enabledMu sync.RWMutex
)

// IsEnabled checks whether blockio is enabled.
func IsEnabled() bool {
        enabledMu.RLock()
        state := enabled
        enabledMu.RUnlock()

        return state
}

// SetConfig updates blockio config with a given config path.
//
// blockio is marked disabled first and only re-enabled after the file has
// been loaded successfully. An empty path leaves it disabled and returns nil.
func SetConfig(configFilePath string) error {
        enabledMu.Lock()
        defer enabledMu.Unlock()

        enabled = false

        switch {
        case configFilePath == "":
                log.L.Debug("No blockio config file specified, blockio not configured")
        default:
                if err := blockio.SetConfigFromFile(configFilePath, true); err != nil {
                        return fmt.Errorf("blockio not enabled: %w", err)
                }
                enabled = true
        }
        return nil
}

// ClassNameToLinuxOCI converts blockio class name into the LinuxBlockIO
// structure in the OCI runtime spec.
//
// It forwards directly to goresctrl's blockio.OciLinuxBlockIO and returns
// its result unchanged.
func ClassNameToLinuxOCI(className string) (*runtimespec.LinuxBlockIO, error) {
        return blockio.OciLinuxBlockIO(className)
}

// ContainerClassFromAnnotations examines container and pod annotations of a
// container and returns its blockio class.
//
// It forwards directly to goresctrl's blockio.ContainerClassFromAnnotations
// and returns its result unchanged.
func ContainerClassFromAnnotations(containerName string, containerAnnotations, podAnnotations map[string]string) (string, error) {
        return blockio.ContainerClassFromAnnotations(containerName, containerAnnotations, podAnnotations)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package cap provides Linux capability utility
package cap

import (
        "bufio"
        "fmt"
        "io"
        "os"
        "strconv"
        "strings"
)

// FromNumber maps a capability number such as 21 to its name, e.g.
// "CAP_SYS_ADMIN".
//
// Numbers outside the known table yield an empty string.
func FromNumber(num int) string {
        if num >= 0 && num < len(capsLatest) {
                return capsLatest[num]
        }
        return ""
}

// FromBitmap decodes a uint64 capability bitmap into capability names such as
// []string{"CAP_SYS_ADMIN", ...}, in ascending bit order.
//
// Set bits that have no known name are reported by number in the second
// return value.
func FromBitmap(v uint64) ([]string, []int) {
        var (
                names   []string
                unknown []int
        )
        for bit := 0; bit < 64; bit++ {
                if v&(uint64(1)<<bit) == 0 {
                        continue
                }
                name := FromNumber(bit)
                if name == "" {
                        unknown = append(unknown, bit)
                        continue
                }
                names = append(names, name)
        }
        return names, unknown
}

// Type is the type of capability
type Type int

const (
        // Effective is CapEff
        Effective Type = 1 << iota
        // Permitted is CapPrm
        Permitted
        // Inheritable is CapInh
        Inheritable
        // Bounding is CapBnd
        Bounding
        // Ambient is CapAmb
        Ambient
)

// ParseProcPIDStatus returns uint64 bitmap value from /proc/<PID>/status file
//
// r is read line by line. Only the CapInh, CapPrm, CapEff, CapBnd and CapAmb
// fields are used; their hexadecimal values are stored under the matching
// Type. Every other line, including lines without a colon, is skipped.
//
// A capability value that is not valid hexadecimal yields an error that
// quotes the offending line and wraps the underlying strconv error. A read
// error from the scanner is returned as is.
func ParseProcPIDStatus(r io.Reader) (map[Type]uint64, error) {
        res := make(map[Type]uint64)
        scanner := bufio.NewScanner(r)
        for scanner.Scan() {
                line := scanner.Text()
                k, v, ok := strings.Cut(line, ":")
                if !ok {
                        continue
                }

                var typ Type
                switch strings.TrimSpace(k) {
                case "CapInh":
                        typ = Inheritable
                case "CapPrm":
                        typ = Permitted
                case "CapEff":
                        typ = Effective
                case "CapBnd":
                        typ = Bounding
                case "CapAmb":
                        typ = Ambient
                default:
                        // not a capability field
                        continue
                }

                ui64, err := strconv.ParseUint(strings.TrimSpace(v), 16, 64)
                if err != nil {
                        return nil, fmt.Errorf("failed to parse line %q: %w", line, err)
                }
                res[typ] = ui64
        }
        if err := scanner.Err(); err != nil {
                return nil, err
        }
        return res, nil
}

// Current returns the names of the effective capabilities of the current
// process, as read from /proc/self/status. Effective bits without a known
// name are dropped.
//
// The result is like []string{"CAP_SYS_ADMIN", ...}.
func Current() ([]string, error) {
        status, err := os.Open("/proc/self/status")
        if err != nil {
                return nil, err
        }
        defer status.Close()

        parsed, err := ParseProcPIDStatus(status)
        if err != nil {
                return nil, err
        }

        // The second result (unknown bit numbers) is intentionally discarded.
        known, _ := FromBitmap(parsed[Effective])
        return known, nil
}

// The tables below list capability names indexed by capability number. Each
// newer table extends the previous one, so an index keeps its meaning across
// kernel versions.
var (
        // caps35 is the caps of kernel 3.5 (37 entries)
        caps35 = []string{
                "CAP_CHOWN",            // 2.2
                "CAP_DAC_OVERRIDE",     // 2.2
                "CAP_DAC_READ_SEARCH",  // 2.2
                "CAP_FOWNER",           // 2.2
                "CAP_FSETID",           // 2.2
                "CAP_KILL",             // 2.2
                "CAP_SETGID",           // 2.2
                "CAP_SETUID",           // 2.2
                "CAP_SETPCAP",          // 2.2
                "CAP_LINUX_IMMUTABLE",  // 2.2
                "CAP_NET_BIND_SERVICE", // 2.2
                "CAP_NET_BROADCAST",    // 2.2
                "CAP_NET_ADMIN",        // 2.2
                "CAP_NET_RAW",          // 2.2
                "CAP_IPC_LOCK",         // 2.2
                "CAP_IPC_OWNER",        // 2.2
                "CAP_SYS_MODULE",       // 2.2
                "CAP_SYS_RAWIO",        // 2.2
                "CAP_SYS_CHROOT",       // 2.2
                "CAP_SYS_PTRACE",       // 2.2
                "CAP_SYS_PACCT",        // 2.2
                "CAP_SYS_ADMIN",        // 2.2
                "CAP_SYS_BOOT",         // 2.2
                "CAP_SYS_NICE",         // 2.2
                "CAP_SYS_RESOURCE",     // 2.2
                "CAP_SYS_TIME",         // 2.2
                "CAP_SYS_TTY_CONFIG",   // 2.2
                "CAP_MKNOD",            // 2.4
                "CAP_LEASE",            // 2.4
                "CAP_AUDIT_WRITE",      // 2.6.11
                "CAP_AUDIT_CONTROL",    // 2.6.11
                "CAP_SETFCAP",          // 2.6.24
                "CAP_MAC_OVERRIDE",     // 2.6.25
                "CAP_MAC_ADMIN",        // 2.6.25
                "CAP_SYSLOG",           // 2.6.37
                "CAP_WAKE_ALARM",       // 3.0
                "CAP_BLOCK_SUSPEND",    // 3.5
        }
        // caps316 is the caps of kernel 3.16 (38 entries)
        caps316 = append(caps35, "CAP_AUDIT_READ")
        // caps58 is the caps of kernel 5.8 (40 entries)
        caps58 = append(caps316, []string{"CAP_PERFMON", "CAP_BPF"}...)
        // caps59 is the caps of kernel 5.9 (41 entries)
        caps59     = append(caps58, "CAP_CHECKPOINT_RESTORE")
        capsLatest = caps59
)

// Known returns the known cap strings of the latest kernel.
// The current latest kernel is 5.9.
//
// The result is a fresh copy on every call, so callers may modify it without
// affecting the package's lookup table.
func Known() []string {
        return append([]string(nil), capsLatest...)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package cio

import (
        "context"
        "fmt"
        "io"
        "net/url"
        "os"
        "path/filepath"
        "strings"
        "sync"

        "github.com/containerd/containerd/v2/defaults"
)

// bufPool hands out pointers to 32 KiB byte slices used as scratch buffers
// for io.CopyBuffer.
var bufPool = sync.Pool{
        New: func() interface{} {
                scratch := make([]byte, 32*1024)
                return &scratch
        },
}

// Config holds the IO configurations.
//
// An empty Stdin, Stdout or Stderr means the corresponding stream is not
// set up.
type Config struct {
        // Terminal is true if one has been allocated
        Terminal bool
        // Stdin path
        Stdin string
        // Stdout path
        Stdout string
        // Stderr path
        Stderr string
}

// IO holds the io information for a task or process.
type IO interface {
        // Config returns the IO configuration.
        Config() Config
        // Cancel aborts all current io operations.
        Cancel()
        // Wait blocks until all io copy operations have completed.
        Wait()
        // Close cleans up all open io resources. Cancel() is always called before
        // Close().
        Close() error
}

// Creator creates a new IO set for the task identified by id.
type Creator func(id string) (IO, error)

// Attach allows callers to reattach to running tasks.
//
// There should only be one reader for a task's IO set: a FIFO can only be
// read by one reader at a time, otherwise the output is delivered only to
// whichever reader reads first.
type Attach func(*FIFOSet) (IO, error)

// FIFOSet is a set of file paths to FIFOs for a task's standard IO streams.
// Although it also supports streaming io other than FIFOs, the name
// "FIFOSet" is kept because it is referenced in too many places.
type FIFOSet struct {
        Config
        // close is the optional cleanup function invoked by Close.
        close func() error
}

// Close runs the cleanup function the FIFOSet was created with. It is safe
// to call on a nil FIFOSet or on one without a cleanup function; both
// return nil.
func (f *FIFOSet) Close() error {
        if f == nil || f.close == nil {
                return nil
        }
        return f.close()
}

// NewFIFOSet returns a new FIFOSet from a Config and a close function.
//
// close is what FIFOSet.Close invokes; it may be nil, in which case Close
// does nothing.
func NewFIFOSet(config Config, close func() error) *FIFOSet {
        return &FIFOSet{Config: config, close: close}
}

// Streams holds the settings used to configure a Creator or Attach.
type Streams struct {
        // Stdin is the source copied into the task's stdin; nil disables stdin.
        Stdin    io.Reader
        // Stdout receives the task's stdout; nil disables stdout.
        Stdout   io.Writer
        // Stderr receives the task's stderr; nil disables stderr.
        Stderr   io.Writer
        // Terminal is passed on when the FIFO set is created.
        Terminal bool
        // FIFODir is the directory in which the FIFO set is created.
        FIFODir  string
}

// Opt customizes the Streams used for creating a Creator or Attach.
type Opt func(*Streams)

// WithStdio points the stream options at the process's own standard input,
// output and error.
func WithStdio(opt *Streams) {
        opt.Stdin = os.Stdin
        opt.Stdout = os.Stdout
        opt.Stderr = os.Stderr
}

// WithTerminal sets the terminal option to true.
func WithTerminal(opt *Streams) {
        opt.Terminal = true
}

// WithStreams returns an Opt that replaces all three streams with the given
// Reader and Writers. A nil argument clears the corresponding stream.
func WithStreams(stdin io.Reader, stdout, stderr io.Writer) Opt {
        return func(opt *Streams) {
                opt.Stdin, opt.Stdout, opt.Stderr = stdin, stdout, stderr
        }
}

// WithFIFODir returns an Opt that sets the fifo directory,
// e.g. "/run/containerd/fifo", "/run/users/1001/containerd/fifo".
func WithFIFODir(dir string) Opt {
        return func(opt *Streams) {
                opt.FIFODir = dir
        }
}

// NewCreator builds a Creator from the given options. When no FIFO directory
// is configured, defaults.DefaultFIFODir is used.
//
// Each call of the returned Creator makes a new FIFO set for the id, drops
// the path of every stream that was not supplied, and starts the copy
// routines through copyIO.
func NewCreator(opts ...Opt) Creator {
        streams := new(Streams)
        for i := range opts {
                opts[i](streams)
        }
        if streams.FIFODir == "" {
                streams.FIFODir = defaults.DefaultFIFODir
        }

        create := func(id string) (IO, error) {
                fifos, err := NewFIFOSetInDir(streams.FIFODir, id, streams.Terminal)
                if err != nil {
                        return nil, err
                }
                // An empty path tells copyIO to leave that stream alone.
                if streams.Stderr == nil {
                        fifos.Stderr = ""
                }
                if streams.Stdout == nil {
                        fifos.Stdout = ""
                }
                if streams.Stdin == nil {
                        fifos.Stdin = ""
                }
                return copyIO(fifos, streams)
        }
        return create
}

// NewAttach builds an Attach that connects the existing io of a task to the
// Reader/Writers configured through opts.
//
// The returned function rejects a nil FIFOSet. It clears, on the FIFOSet it
// is given, the path of every stream that was not supplied before handing
// it to copyIO.
func NewAttach(opts ...Opt) Attach {
        streams := new(Streams)
        for i := range opts {
                opts[i](streams)
        }

        attach := func(fifos *FIFOSet) (IO, error) {
                if fifos == nil {
                        return nil, fmt.Errorf("cannot attach, missing fifos")
                }
                // An empty path tells copyIO to leave that stream alone.
                if streams.Stderr == nil {
                        fifos.Stderr = ""
                }
                if streams.Stdout == nil {
                        fifos.Stdout = ""
                }
                if streams.Stdin == nil {
                        fifos.Stdin = ""
                }
                return copyIO(fifos, streams)
        }
        return attach
}

// NullIO redirects the container's IO into /dev/null
//
// It returns an IO with an empty Config and no copy routines or closers; the
// id argument is ignored.
// NOTE(review): the /dev/null redirection itself is presumably done by the
// consumer of the empty stream paths — confirm against the runtime.
func NullIO(_ string) (IO, error) {
        return &cio{}, nil
}

// cio is a basic container IO implementation.
type cio struct {
        // config is the value returned by Config.
        config  Config
        // wg, when non-nil, is what Wait blocks on.
        wg      *sync.WaitGroup
        // closers are closed in order by Close; nil entries are skipped.
        closers []io.Closer
        // cancel, when non-nil, is invoked by Cancel.
        cancel  context.CancelFunc
}

// Config returns the IO configuration this cio was created with.
func (c *cio) Config() Config {
        return c.config
}

// Wait blocks until the wait group attached to this cio is done. It returns
// immediately when no wait group is attached.
func (c *cio) Wait() {
        if c.wg == nil {
                return
        }
        c.wg.Wait()
}

// Close closes every registered closer, skipping nil entries. All closers
// are attempted even when one fails; the error of the last failing closer is
// returned, or nil when none failed.
func (c *cio) Close() error {
        var result error
        for _, cl := range c.closers {
                if cl != nil {
                        if err := cl.Close(); err != nil {
                                result = err
                        }
                }
        }
        return result
}

// Cancel invokes the cancel function attached to this cio, if there is one.
func (c *cio) Cancel() {
        if c.cancel == nil {
                return
        }
        c.cancel()
}

// pipes groups the open ends of a task's standard streams. A field is nil
// when the corresponding stream was not opened.
type pipes struct {
        // Stdin is the write end feeding the task's stdin.
        Stdin  io.WriteCloser
        // Stdout is the read end of the task's stdout.
        Stdout io.ReadCloser
        // Stderr is the read end of the task's stderr.
        Stderr io.ReadCloser
}

// DirectIO allows task IO to be handled externally by the caller.
//
// The embedded pipes expose the Stdin, Stdout and Stderr streams directly;
// the embedded cio supplies the IO methods.
type DirectIO struct {
        pipes
        cio
}

// Compile-time checks that DirectIO and logURI implement IO.
var (
        _ IO = &DirectIO{}
        _ IO = &logURI{}
)

// LogURI returns a Creator whose IO reports the given raw logging URI as both
// the stdout and the stderr path. The URI is rendered each time the Creator
// is invoked.
func LogURI(uri *url.URL) Creator {
        return func(_ string) (IO, error) {
                target := uri.String()
                cfg := Config{Stdout: target, Stderr: target}
                return &logURI{config: cfg}, nil
        }
}

// TerminalLogURI returns a Creator whose IO reports the given raw logging URI
// as both the stdout and the stderr path, with the terminal option set to
// true. The URI is rendered each time the Creator is invoked.
func TerminalLogURI(uri *url.URL) Creator {
        return func(_ string) (IO, error) {
                target := uri.String()
                cfg := Config{Terminal: true, Stdout: target, Stderr: target}
                return &logURI{config: cfg}, nil
        }
}

// BinaryIO forwards container STDOUT|STDERR directly to a logging binary.
//
// binary must be an absolute path; args become the query parameters of the
// generated "binary" URI. A URI generation failure is reported when the
// returned Creator is invoked.
func BinaryIO(binary string, args map[string]string) Creator {
        return func(_ string) (IO, error) {
                uri, err := LogURIGenerator("binary", binary, args)
                if err != nil {
                        return nil, err
                }

                out := &logURI{}
                out.config.Stdout = uri.String()
                out.config.Stderr = out.config.Stdout
                return out, nil
        }
}

// TerminalBinaryIO forwards container STDOUT|STDERR directly to a logging
// binary and also sets the terminal option to true.
//
// binary must be an absolute path; args become the query parameters of the
// generated "binary" URI. A URI generation failure is reported when the
// returned Creator is invoked.
func TerminalBinaryIO(binary string, args map[string]string) Creator {
        return func(_ string) (IO, error) {
                uri, err := LogURIGenerator("binary", binary, args)
                if err != nil {
                        return nil, err
                }

                out := &logURI{}
                out.config.Terminal = true
                out.config.Stdout = uri.String()
                out.config.Stderr = out.config.Stdout
                return out, nil
        }
}

// LogFile creates a file on disk that logs the task's STDOUT,STDERR.
// If the log file already exists, the logs will be appended to the file.
//
// path must be absolute; it is turned into a "file" URI that is used as both
// the stdout and the stderr path. A URI generation failure is reported when
// the returned Creator is invoked.
func LogFile(path string) Creator {
        return func(_ string) (IO, error) {
                uri, err := LogURIGenerator("file", path, nil)
                if err != nil {
                        return nil, err
                }

                out := &logURI{}
                out.config.Stdout = uri.String()
                out.config.Stderr = out.config.Stdout
                return out, nil
        }
}

// LogURIGenerator builds a log URI with the given scheme for path.
//
// path is cleaned first and must then be absolute, otherwise an error is
// returned. Every entry of args is added as a query parameter; with no args
// the URI carries no query.
func LogURIGenerator(scheme string, path string, args map[string]string) (*url.URL, error) {
        cleaned := filepath.Clean(path)
        if !filepath.IsAbs(cleaned) {
                return nil, fmt.Errorf("%q must be absolute", cleaned)
        }

        // Without adding / here, C:\foo\bar.txt will become file://C:/foo/bar.txt
        // which is invalid. The path must have three slashes.
        //
        // https://learn.microsoft.com/en-us/archive/blogs/ie/file-uris-in-windows
        // > In the case of a local Windows file path, there is no hostname,
        // > and thus another slash and the path immediately follow.
        slashed := filepath.ToSlash(cleaned)
        if !strings.HasPrefix(cleaned, "/") {
                slashed = "/" + slashed
        }
        uri := &url.URL{Scheme: scheme, Path: slashed}

        if len(args) > 0 {
                query := make(url.Values, len(args))
                for name, value := range args {
                        query.Set(name, value)
                }
                uri.RawQuery = query.Encode()
        }
        return uri, nil
}

// logURI is an IO that only carries a Config; it owns no streams, so Cancel,
// Wait and Close have nothing to do.
type logURI struct {
        config Config
}

// Config returns the configuration the logURI was created with.
func (l *logURI) Config() Config {
        return l.config
}

// Cancel is a no-op.
func (l *logURI) Cancel() {

}

// Wait is a no-op and returns immediately.
func (l *logURI) Wait() {

}

// Close is a no-op and always returns nil.
func (l *logURI) Close() error {
        return nil
}

// Load wraps the io of a container without attaching to it.
//
// This allows io to be loaded on the task for deletion without starting
// copy routines: the returned IO carries the set's Config, and closing it
// closes the set.
func Load(set *FIFOSet) (IO, error) {
        loaded := &cio{config: set.Config}
        loaded.closers = append(loaded.closers, set)
        return loaded, nil
}

// closers returns the three streams as io.Closers, in the order stdin,
// stdout, stderr. Entries for streams that are not set are nil.
func (p *pipes) closers() []io.Closer {
        return []io.Closer{p.Stdin, p.Stdout, p.Stderr}
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package cio

import (
        "context"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "sync"
        "syscall"

        "github.com/containerd/fifo"
)

// NewFIFOSetInDir returns a new FIFOSet whose paths live in a freshly created
// temporary directory under root. A non-empty root is created (mode 0700) if
// it does not exist yet.
//
// The paths are "<id>-stdin", "<id>-stdout" and "<id>-stderr" inside that
// directory. Closing the returned set removes the directory and everything
// in it.
func NewFIFOSetInDir(root, id string, terminal bool) (*FIFOSet, error) {
        if root != "" {
                if err := os.MkdirAll(root, 0700); err != nil {
                        return nil, err
                }
        }

        dir, err := os.MkdirTemp(root, "")
        if err != nil {
                return nil, err
        }

        streamPath := func(stream string) string {
                return filepath.Join(dir, id+"-"+stream)
        }
        cfg := Config{
                Terminal: terminal,
                Stdin:    streamPath("stdin"),
                Stdout:   streamPath("stdout"),
                Stderr:   streamPath("stderr"),
        }
        cleanup := func() error {
                return os.RemoveAll(dir)
        }
        return NewFIFOSet(cfg, cleanup), nil
}

// copyIO opens the fifos named in fifos and starts one goroutine per
// configured stream that copies between the fifo and the matching stream in
// ioset, using buffers from bufPool.
//
// The returned cio waits (Wait) for the stdout and stderr copies only, closes
// the pipes and the FIFOSet on Close, and on Cancel cancels the context used
// to open the fifos and closes the pipes.
//
// If the fifos cannot be opened, the context is cancelled and the error is
// returned.
func copyIO(fifos *FIFOSet, ioset *Streams) (*cio, error) {
        var ctx, cancel = context.WithCancel(context.Background())
        pipes, err := openFifos(ctx, fifos)
        if err != nil {
                cancel()
                return nil, err
        }

        // The stdin copy is deliberately not added to the wait group, so Wait
        // does not block on it.
        if fifos.Stdin != "" {
                go func() {
                        p := bufPool.Get().(*[]byte)
                        defer bufPool.Put(p)

                        io.CopyBuffer(pipes.Stdin, ioset.Stdin, *p)
                        pipes.Stdin.Close()
                }()
        }

        var wg = &sync.WaitGroup{}
        if fifos.Stdout != "" {
                wg.Add(1)
                go func() {
                        p := bufPool.Get().(*[]byte)
                        defer bufPool.Put(p)

                        io.CopyBuffer(ioset.Stdout, pipes.Stdout, *p)
                        pipes.Stdout.Close()
                        wg.Done()
                }()
        }

        // With a terminal no separate stderr copy is started.
        if !fifos.Terminal && fifos.Stderr != "" {
                wg.Add(1)
                go func() {
                        p := bufPool.Get().(*[]byte)
                        defer bufPool.Put(p)

                        io.CopyBuffer(ioset.Stderr, pipes.Stderr, *p)
                        pipes.Stderr.Close()
                        wg.Done()
                }()
        }
        return &cio{
                config:  fifos.Config,
                wg:      wg,
                closers: append(pipes.closers(), fifos),
                cancel: func() {
                        cancel()
                        // Also close the pipes; unopened streams are nil and skipped.
                        for _, c := range pipes.closers() {
                                if c != nil {
                                        c.Close()
                                }
                        }
                },
        }, nil
}

// openFifos opens (creating if necessary, mode 0700, non-blocking) the fifos
// named in fifos: stdin for writing, stdout and stderr for reading. A stream
// with an empty path is skipped, and stderr is also skipped when
// fifos.Terminal is set.
//
// On failure the deferred functions close every fifo opened so far and close
// the FIFOSet itself; the returned error names the stream that failed.
func openFifos(ctx context.Context, fifos *FIFOSet) (f pipes, retErr error) {
        // Registered first, so it runs last: after the individual pipes have
        // been closed by the defers below.
        defer func() {
                if retErr != nil {
                        fifos.Close()
                }
        }()

        if fifos.Stdin != "" {
                if f.Stdin, retErr = fifo.OpenFifo(ctx, fifos.Stdin, syscall.O_WRONLY|syscall.O_CREAT|syscall.O_NONBLOCK, 0700); retErr != nil {
                        return f, fmt.Errorf("failed to open stdin fifo: %w", retErr)
                }
                // Close stdin again if a later stream fails to open.
                defer func() {
                        if retErr != nil && f.Stdin != nil {
                                f.Stdin.Close()
                        }
                }()
        }
        if fifos.Stdout != "" {
                if f.Stdout, retErr = fifo.OpenFifo(ctx, fifos.Stdout, syscall.O_RDONLY|syscall.O_CREAT|syscall.O_NONBLOCK, 0700); retErr != nil {
                        return f, fmt.Errorf("failed to open stdout fifo: %w", retErr)
                }
                // Close stdout again if stderr fails to open.
                defer func() {
                        if retErr != nil && f.Stdout != nil {
                                f.Stdout.Close()
                        }
                }()
        }
        if !fifos.Terminal && fifos.Stderr != "" {
                if f.Stderr, retErr = fifo.OpenFifo(ctx, fifos.Stderr, syscall.O_RDONLY|syscall.O_CREAT|syscall.O_NONBLOCK, 0700); retErr != nil {
                        return f, fmt.Errorf("failed to open stderr fifo: %w", retErr)
                }
        }
        return f, nil
}

// NewDirectIO returns an IO implementation that exposes the IO streams as io.ReadCloser
// and io.WriteCloser.
//
// The fifos are opened under a context derived from ctx; Cancel on the
// returned DirectIO cancels that context, and Close closes the pipes and the
// FIFOSet.
//
// If the fifos cannot be opened, the derived context is released and
// (nil, err) is returned; openFifos has already closed anything it opened.
func NewDirectIO(ctx context.Context, fifos *FIFOSet) (*DirectIO, error) {
        ctx, cancel := context.WithCancel(ctx)
        pipes, err := openFifos(ctx, fifos)
        if err != nil {
                // Nothing would ever call cancel otherwise, leaving the derived
                // context registered with its parent.
                cancel()
                return nil, err
        }
        return &DirectIO{
                pipes: pipes,
                cio: cio{
                        config:  fifos.Config,
                        closers: append(pipes.closers(), fifos),
                        cancel:  cancel,
                },
        }, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package deprecation

// Warning is the string identifier of a deprecation warning.
type Warning string

const (
        // Prefix is a standard prefix for all Warnings, used for filtering plugin Exports
        Prefix = "io.containerd.deprecation/"
        // PullSchema1Image is a warning for the use of schema 1 images
        PullSchema1Image Warning = Prefix + "pull-schema-1-image"
        // GoPluginLibrary is a warning for the use of dynamic library Go plugins
        GoPluginLibrary Warning = Prefix + "go-plugin-library"
        // CRIRegistryMirrors is a warning for the use of the `mirrors` property
        CRIRegistryMirrors Warning = Prefix + "cri-registry-mirrors"
        // CRIRegistryAuths is a warning for the use of the `auths` property
        CRIRegistryAuths Warning = Prefix + "cri-registry-auths"
        // CRIRegistryConfigs is a warning for the use of the `configs` property
        CRIRegistryConfigs Warning = Prefix + "cri-registry-configs"
        // TracingOTLPConfig is a warning for the use of the `otlp` property
        TracingOTLPConfig Warning = Prefix + "tracing-processor-config"
        // TracingServiceConfig is a warning for the use of the `tracing` property
        TracingServiceConfig Warning = Prefix + "tracing-service-config"
)

const (
        // EnvPrefix is the common prefix of the environment variable names
        // declared in this block.
        EnvPrefix           = "CONTAINERD_ENABLE_DEPRECATED_"
        // EnvPullSchema1Image is the environment variable name
        // CONTAINERD_ENABLE_DEPRECATED_PULL_SCHEMA_1_IMAGE.
        // NOTE(review): presumably setting it re-enables pulling schema 1
        // images; confirm against the code that reads it.
        EnvPullSchema1Image = EnvPrefix + "PULL_SCHEMA_1_IMAGE"
)

// messages maps every known Warning to its human-readable deprecation message.
// Multi-sentence messages are built by concatenation; each leading fragment
// ends with a space so the sentences do not run together.
var messages = map[Warning]string{
        PullSchema1Image: "Schema 1 images are deprecated since containerd v1.7, disabled in containerd v2.0, and will be removed in containerd v2.1. " +
                `Since containerd v1.7.8, schema 1 images are identified by the "io.containerd.image/converted-docker-schema1" label.`,
        GoPluginLibrary: "Dynamically-linked Go plugins as containerd runtimes are deprecated since containerd v2.0 and removed in containerd v2.1.",
        CRIRegistryMirrors: "The `mirrors` property of `[plugins.\"io.containerd.grpc.v1.cri\".registry]` is deprecated since containerd v1.5 and will be removed in containerd v2.0. " +
                "Use `config_path` instead.",
        CRIRegistryAuths: "The `auths` property of `[plugins.\"io.containerd.grpc.v1.cri\".registry]` is deprecated since containerd v1.3 and will be removed in containerd v2.0. " +
                "Use `ImagePullSecrets` instead.",
        CRIRegistryConfigs: "The `configs` property of `[plugins.\"io.containerd.grpc.v1.cri\".registry]` is deprecated since containerd v1.5 and will be removed in containerd v2.0. " +
                "Use `config_path` instead.",

        TracingOTLPConfig: "The `otlp` property of `[plugins.\"io.containerd.tracing.processor.v1\".otlp]` is deprecated since containerd v1.6 and will be removed in containerd v2.0. " +
                "Use OTLP environment variables instead: https://opentelemetry.io/docs/specs/otel/protocol/exporter/",
        TracingServiceConfig: "The `tracing` property of `[plugins.\"io.containerd.internal.v1\".tracing]` is deprecated since containerd v1.6 and will be removed in containerd v2.0. " +
                "Use OTEL environment variables instead: https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/",
}

// Valid reports whether id is a known Warning, i.e. whether it has an entry
// in the messages table.
func Valid(id Warning) bool {
        if _, known := messages[id]; known {
                return true
        }
        return false
}

// Message looks up the human-readable message registered for id. The boolean
// result is false (and the string empty) when id is not a known Warning.
func Message(id Warning) (string, bool) {
        text, known := messages[id]
        if !known {
                return "", false
        }
        return text, true
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package dialer

import (
        "context"
        "fmt"
        "net"
        "time"
)

// dialResult carries the outcome of a dial attempt from the dialing goroutine
// in timeoutDialer back to its caller.
type dialResult struct {
        c   net.Conn // connection returned by the dial attempt
        err error    // error returned by the dial attempt
}

// ContextDialer returns a GRPC net.Conn connected to the provided address.
//
// Only the context's deadline is consulted: when one is set, the time
// remaining until it is passed to timeoutDialer as the timeout; otherwise a
// timeout of zero is passed.
func ContextDialer(ctx context.Context, address string) (net.Conn, error) {
        var timeout time.Duration
        if deadline, hasDeadline := ctx.Deadline(); hasDeadline {
                timeout = time.Until(deadline)
        }
        return timeoutDialer(address, timeout)
}

// timeoutDialer dials address in a background goroutine, retrying every 10ms
// for as long as the attempt fails with an error that isNoent recognises, and
// returns the first other result (success or failure).
//
// timeout bounds the overall wait. A timeout of zero means "no timeout", which
// is how net.DialTimeout interprets the same value; in that case no timer is
// armed and the call waits until a dial attempt produces a result. When the
// timeout elapses first, the goroutine is told to stop and any connection it
// still produces afterwards is closed so that it is not leaked.
func timeoutDialer(address string, timeout time.Duration) (net.Conn, error) {
        var (
                stopC = make(chan struct{})
                synC  = make(chan *dialResult)
        )
        go func() {
                defer close(synC)
                for {
                        select {
                        case <-stopC:
                                return
                        default:
                                c, err := dialer(address, timeout)
                                if isNoent(err) {
                                        // The endpoint does not exist yet; back off briefly and retry.
                                        time.Sleep(10 * time.Millisecond)
                                        continue
                                }
                                synC <- &dialResult{c, err}
                                return
                        }
                }
        }()

        // A nil channel never becomes ready, so with a zero timeout the select
        // below only waits for the dial result instead of firing immediately.
        var timeoutC <-chan time.Time
        if timeout != 0 {
                timer := time.NewTimer(timeout)
                defer timer.Stop()
                timeoutC = timer.C
        }

        select {
        case dr := <-synC:
                return dr.c, dr.err
        case <-timeoutC:
                close(stopC)
                // Drain synC: either it is closed (nil result) or a late result
                // arrives whose connection must be closed.
                go func() {
                        dr := <-synC
                        if dr != nil && dr.c != nil {
                                dr.c.Close()
                        }
                }()
                return nil, fmt.Errorf("dial %s: timeout", address)
        }
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package dialer

import (
        "errors"
        "fmt"
        "net"
        "strings"
        "syscall"
        "time"
)

// DialAddress turns a socket path into a dialable address by prefixing it
// with the "unix://" scheme.
func DialAddress(address string) string {
        const scheme = "unix://"
        return fmt.Sprint(scheme, address)
}

// isNoent reports whether err is, or wraps, syscall.ENOENT.
func isNoent(err error) bool {
        const missing = syscall.ENOENT
        return errors.Is(err, missing)
}

// dialer connects to the unix socket at address, accepting the address with
// or without a leading "unix://" scheme. timeout is applied as the dial
// timeout.
func dialer(address string, timeout time.Duration) (net.Conn, error) {
        socketPath := strings.TrimPrefix(address, "unix://")
        d := net.Dialer{Timeout: timeout}
        return d.Dial("unix", socketPath)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package epoch

import (
        "context"
        "time"
)

type (
        // epochKey is the context key type under which the epoch is stored.
        epochKey struct{}
)

// WithSourceDateEpoch returns a copy of ctx that carries tm as its epoch.
func WithSourceDateEpoch(ctx context.Context, tm *time.Time) context.Context {
        var key epochKey
        return context.WithValue(ctx, key, tm)
}

// FromContext returns the epoch stored in ctx, or nil when none is stored.
// FromContext does not fall back to read the SOURCE_DATE_EPOCH env var.
func FromContext(ctx context.Context) *time.Time {
        if stored := ctx.Value(epochKey{}); stored != nil {
                return stored.(*time.Time)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package epoch provides SOURCE_DATE_EPOCH utilities.
package epoch

import (
        "fmt"
        "os"
        "strconv"
        "time"
)

// SourceDateEpochEnv is the name of the SOURCE_DATE_EPOCH environment
// variable.
// See https://reproducible-builds.org/docs/source-date-epoch/
const SourceDateEpochEnv = "SOURCE_DATE_EPOCH"

// SourceDateEpoch reads the SOURCE_DATE_EPOCH env var and returns it as a
// *time.Time. An unset or empty variable yields (nil, nil); a value that
// cannot be parsed yields an error wrapping the parse failure.
func SourceDateEpoch() (*time.Time, error) {
        raw := os.Getenv(SourceDateEpochEnv)
        if raw == "" {
                // Unset and empty are both treated as "no epoch", not as an error.
                return nil, nil
        }
        tm, err := ParseSourceDateEpoch(raw)
        if err == nil {
                return tm, nil
        }
        return nil, fmt.Errorf("invalid %s value: %w", SourceDateEpochEnv, err)
}

// ParseSourceDateEpoch converts sourceDateEpoch, a base-10 count of seconds
// since the Unix epoch, into a UTC *time.Time.
// An empty or malformed input results in an error.
func ParseSourceDateEpoch(sourceDateEpoch string) (*time.Time, error) {
        if len(sourceDateEpoch) == 0 {
                return nil, fmt.Errorf("value is empty")
        }
        seconds, parseErr := strconv.ParseInt(sourceDateEpoch, 10, 64)
        if parseErr != nil {
                return nil, fmt.Errorf("invalid value: %w", parseErr)
        }
        tm := time.Unix(seconds, 0).UTC()
        return &tm, nil
}

// SetSourceDateEpoch stores tm, as decimal Unix seconds, in the
// SOURCE_DATE_EPOCH env var. Any error from setting the variable is ignored.
func SetSourceDateEpoch(tm time.Time) {
        seconds := strconv.FormatInt(tm.Unix(), 10)
        _ = os.Setenv(SourceDateEpochEnv, seconds)
}

// UnsetSourceDateEpoch unsets the SOURCE_DATE_EPOCH env var.
// Any error from unsetting the variable is ignored.
func UnsetSourceDateEpoch() {
        _ = os.Unsetenv(SourceDateEpochEnv)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package filters

// Adaptor specifies the mapping of fieldpaths to a type. For the given field
// path, the value and whether it is present should be returned. The mapping of
// the fieldpath to a field is deferred to the adaptor implementation, but
// should generally follow protobuf field path/mask semantics.
type Adaptor interface {
        // Field returns the value found at fieldpath and whether the field is
        // present.
        Field(fieldpath []string) (value string, present bool)
}

// AdapterFunc allows implementation specific matching of fieldpaths. It lets
// a plain function be used where a value with a Field method is needed.
type AdapterFunc func(fieldpath []string) (string, bool)

// Field calls fn with fieldpath and returns its results unchanged: the field
// value and whether the field is present.
func (fn AdapterFunc) Field(fieldpath []string) (string, bool) {
        return fn(fieldpath)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package filters defines a syntax and parser that can be used for the
// filtration of items across the containerd API. The core is built on the
// concept of protobuf field paths, with quoting.  Several operators allow the
// user to flexibly select items based on field presence, equality, inequality
// and regular expressions. Flexible adaptors support working with any type.
//
// The syntax is fairly familiar, if you've used container ecosystem
// projects.  At the core, we base it on the concept of protobuf field
// paths, augmenting with the ability to quote portions of the field path
// to match arbitrary labels. These "selectors" come in the following
// syntax:
//
// ```
// <fieldpath>[<operator><value>]
// ```
//
// A basic example is as follows:
//
// ```
// name==foo
// ```
//
// This would match all objects that have a field `name` with the value
// `foo`. If we only want to test if the field is present, we can omit the
// operator. This is most useful for matching labels in containerd. The
// following will match objects that have the field "labels" and have the
// label "foo" defined:
//
// ```
// labels.foo
// ```
//
// We also allow for quoting of parts of the field path to allow matching
// of arbitrary items:
//
// ```
// labels."very complex label"==something
// ```
//
// We also define `!=` and `~=` as operators. The `!=` will match all
// objects that don't match the value for a field and `~=` will compile the
// target value as a regular expression and match the field value against that.
//
// Selectors can be combined using a comma, such that the resulting
// selector will require all selectors are matched for the object to match.
// The following example will match objects that are named `foo` and have
// the label `bar`:
//
// ```
// name==foo,labels.bar
// ```
package filters

import (
        "regexp"

        "github.com/containerd/log"
)

// Filter matches specific resources based on the provided filter
type Filter interface {
        // Match reports whether the object exposed through adaptor satisfies
        // the filter.
        Match(adaptor Adaptor) bool
}

// FilterFunc is a function that handles matching with an adaptor. Its Match
// method lets a plain function be used as a Filter.
type FilterFunc func(Adaptor) bool

// Match calls fn with adaptor and returns its result, i.e. true if the object
// matches the filter.
func (fn FilterFunc) Match(adaptor Adaptor) bool {
        return fn(adaptor)
}

// Always is a filter that always returns true for any type of object; the
// adaptor it is given is never inspected.
var Always FilterFunc = func(adaptor Adaptor) bool {
        return true
}

// Any is a list of filters combined with OR semantics.
type Any []Filter

// Match reports whether at least one of the filters in m matches the object.
// Filters are tried in order and evaluation stops at the first match; an
// empty list matches nothing.
func (m Any) Match(adaptor Adaptor) bool {
        for i := range m {
                if m[i].Match(adaptor) {
                        return true
                }
        }
        return false
}

// All is a list of filters combined with AND semantics.
type All []Filter

// Match reports whether every filter in m matches the object. Filters are
// tried in order and evaluation stops at the first mismatch; an empty list
// matches everything.
func (m All) Match(adaptor Adaptor) bool {
        for i := range m {
                if !m[i].Match(adaptor) {
                        return false
                }
        }
        return true
}

// operator identifies the kind of test a selector applies to a field.
type operator int

// Supported operators.
// NOTE(review): the constants are declared without an explicit type, so they
// are untyped integer constants rather than values of type operator.
const (
        operatorPresent  = iota // field presence test; rendered as "?"
        operatorEqual           // "=="
        operatorNotEqual        // "!="
        operatorMatches         // "~=", regular expression match
)

// String returns the textual form of op: "?" for the presence test, the
// operator's filter syntax ("==", "!=", "~=") for comparisons, and "unknown"
// for any other value.
func (op operator) String() string {
        text := "unknown"
        switch op {
        case operatorPresent:
                text = "?"
        case operatorEqual:
                text = "=="
        case operatorNotEqual:
                text = "!="
        case operatorMatches:
                text = "~="
        }
        return text
}

// selector is a single filter expression: a field path, the operator to
// apply, and the value to compare against.
type selector struct {
        fieldpath []string       // path of the field handed to Adaptor.Field
        operator  operator       // test applied to the field
        value     string         // comparison value, or the regexp source for operatorMatches
        re        *regexp.Regexp // optional pre-compiled regexp; when nil, Match compiles value itself
}

// Match evaluates the selector against the object exposed through adaptor.
//
//   - operatorPresent: true when the field is present.
//   - operatorEqual: true when the field is present and equals m.value.
//   - operatorNotEqual: true when the returned field value differs from
//     m.value; presence is not consulted.
//   - operatorMatches: true when the field value matches the regular
//     expression; a pattern that fails to compile is logged and never matches.
//
// Any other operator yields false.
func (m selector) Match(adaptor Adaptor) bool {
        value, present := adaptor.Field(m.fieldpath)

        switch m.operator {
        case operatorPresent:
                return present
        case operatorEqual:
                return present && value == m.value
        case operatorNotEqual:
                return value != m.value
        case operatorMatches:
                re := m.re
                if re == nil {
                        // m is a value receiver, so a compiled expression cannot be
                        // stored back on the selector; compile into a local for
                        // this call only.
                        compiled, err := regexp.Compile(m.value)
                        if err != nil {
                                log.L.WithError(err).Errorf("error compiling regexp %q", m.value)
                                return false
                        }
                        re = compiled
                }

                return re.MatchString(value)
        default:
                return false
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package filters

import (
        "fmt"
        "io"

        "github.com/containerd/errdefs"
)

/*
Parse turns the string s into a Filter that can be evaluated against an
adaptor. An empty string produces a filter that matches everything.

A filter consists of one or more selectors, written as a comma separated list
of expressions of the form `<fieldpath><op><value>`. Every selector has to
match the target object for the filter as a whole to match.

The operators are "==" (equal), "!=" (not equal) and "~=" (regular expression
match). When operator and value are omitted, the selector only tests for the
presence of the field, as defined by the target object.

The formal grammar is as follows:

selectors := selector ("," selector)*
selector  := fieldpath [operator value]
fieldpath := field ('.' field)*
field     := quoted | [A-Za-z] [A-Za-z0-9_]+
operator  := "==" | "!=" | "~="
value     := quoted | [^\s,]+
quoted    := <go string syntax>
*/
func Parse(s string) (Filter, error) {
        if len(s) == 0 {
                // Nothing to parse: the empty filter matches all objects.
                return Always, nil
        }

        return (&parser{input: s}).parse()
}

// ParseAll parses every expression in ss and combines the results so that the
// returned filter matches when at least one of them matches.
//
// With no expressions at all, the returned filter matches anything. A parse
// failure is reported as an errdefs.ErrInvalidArgument error whose message
// starts with the parser's message.
func ParseAll(ss ...string) (Filter, error) {
        if len(ss) == 0 {
                return Always, nil
        }

        parsed := make(Any, 0, len(ss))
        for i := range ss {
                f, err := Parse(ss[i])
                if err != nil {
                        return nil, fmt.Errorf("%s: %w", err.Error(), errdefs.ErrInvalidArgument)
                }
                parsed = append(parsed, f)
        }

        return parsed, nil
}

// parser holds the state for parsing one filter expression.
type parser struct {
        input   string  // the expression being parsed; also embedded in parse errors
        scanner scanner // tokenizer over input, initialised by parse
}

// parse initialises the scanner with the parser's input and parses the whole
// selector list. Errors are wrapped with a "filters: " prefix.
func (p *parser) parse() (Filter, error) {
        p.scanner.init(p.input)

        filter, err := p.selectors()
        if err == nil {
                return filter, nil
        }

        return nil, fmt.Errorf("filters: %w", err)
}

// selectors parses a comma separated list of one or more selectors and
// returns them as an All filter. Parsing stops successfully at end of input;
// anything other than a comma after a selector is an error.
func (p *parser) selectors() (Filter, error) {
        first, err := p.selector()
        if err != nil {
                return nil, err
        }

        matchers := All{first}
        for {
                switch tok := p.scanner.peek(); tok {
                case tokenEOF:
                        return matchers, nil
                case ',':
                        // Consume the separator, then parse the selector after it.
                        pos, kind, _ := p.scanner.scan()
                        if kind != tokenSeparator {
                                return nil, p.mkerr(pos, "expected a separator")
                        }

                        next, err := p.selector()
                        if err != nil {
                                return nil, err
                        }

                        matchers = append(matchers, next)
                default:
                        return nil, p.mkerr(p.scanner.ppos, "unexpected input: %v", string(tok))
                }
        }
}

// selector parses a single selector: a field path optionally followed by an
// operator and a value. A field path followed directly by a separator or the
// end of input becomes a presence test. Running out of input while reading
// the value is reported as io.ErrUnexpectedEOF.
func (p *parser) selector() (selector, error) {
        path, err := p.fieldpath()
        if err != nil {
                return selector{}, err
        }

        switch p.scanner.peek() {
        case ',', tokenSeparator, tokenEOF:
                // No operator follows: only test for the field's presence.
                return selector{fieldpath: path, operator: operatorPresent}, nil
        }

        op, err := p.operator()
        if err != nil {
                return selector{}, err
        }

        // Alternate quote characters are accepted only for the regexp operator.
        val, err := p.value(op == operatorMatches)
        switch {
        case err == io.EOF:
                return selector{}, io.ErrUnexpectedEOF
        case err != nil:
                return selector{}, err
        }

        return selector{fieldpath: path, value: val, operator: op}, nil
}

// fieldpath parses one or more fields separated by '.' and returns them in
// order. It stops at the first token that is not a '.', leaving that token
// for the caller to handle.
func (p *parser) fieldpath() ([]string, error) {
        first, err := p.field()
        if err != nil {
                return nil, err
        }

        path := []string{first}
        // Look ahead: keep going only while another field separator follows.
        for p.scanner.peek() == '.' {
                pos, kind, _ := p.scanner.scan() // consume separator
                if kind != tokenSeparator {
                        return nil, p.mkerr(pos, "expected a field separator (`.`)")
                }

                next, err := p.field()
                if err != nil {
                        return nil, err
                }

                path = append(path, next)
        }

        return path, nil
}

// field scans the next token and returns it as a field name. A bare field
// token is returned as is; a quoted token is unquoted, with alternate quote
// characters rejected. An illegal token reports the scanner's error message,
// and any other token is an error.
func (p *parser) field() (string, error) {
        pos, tok, s := p.scanner.scan()
        switch tok {
        case tokenField:
                return s, nil
        case tokenQuoted:
                return p.unquote(pos, s, false)
        case tokenIllegal:
                // Pass the scanner message as an argument, not as the format
                // string, so a '%' in it cannot be misread as a verb.
                return "", p.mkerr(pos, "%s", p.scanner.err)
        }

        return "", p.mkerr(pos, "expected field or quoted")
}

// operator scans the next token and maps it to an operator value. Only "==",
// "!=" and "~=" are supported; any other operator token, an illegal token, or
// a token of a different kind is an error.
func (p *parser) operator() (operator, error) {
        pos, tok, s := p.scanner.scan()
        switch tok {
        case tokenOperator:
                switch s {
                case "==":
                        return operatorEqual, nil
                case "!=":
                        return operatorNotEqual, nil
                case "~=":
                        return operatorMatches, nil
                default:
                        return 0, p.mkerr(pos, "unsupported operator %q", s)
                }
        case tokenIllegal:
                // Pass the scanner message as an argument, not as the format
                // string, so a '%' in it cannot be misread as a verb.
                return 0, p.mkerr(pos, "%s", p.scanner.err)
        }

        return 0, p.mkerr(pos, `expected an operator ("=="|"!="|"~=")`)
}

// value scans the next token and returns it as a selector value. Value and
// field tokens are returned as is; a quoted token is unquoted, with alternate
// quote characters accepted only when allowAltQuotes is true. An illegal token
// reports the scanner's error message, and any other token is an error.
func (p *parser) value(allowAltQuotes bool) (string, error) {
        pos, tok, s := p.scanner.scan()

        switch tok {
        case tokenValue, tokenField:
                return s, nil
        case tokenQuoted:
                return p.unquote(pos, s, allowAltQuotes)
        case tokenIllegal:
                // Pass the scanner message as an argument, not as the format
                // string, so a '%' in it cannot be misread as a verb.
                return "", p.mkerr(pos, "%s", p.scanner.err)
        }

        return "", p.mkerr(pos, "expected value or quoted")
}

// unquote strips the quoting from the token s found at pos. Unless allowAlts
// is set, only single and double quotes are accepted as the opening quote
// character. Failures are reported as parse errors at pos.
func (p *parser) unquote(pos int, s string, allowAlts bool) (string, error) {
        if !allowAlts {
                if opening := s[0]; opening != '\'' && opening != '"' {
                        return "", p.mkerr(pos, "invalid quote encountered")
                }
        }

        unquoted, err := unquote(s)
        if err == nil {
                return unquoted, nil
        }

        return "", p.mkerr(pos, "unquoting failed: %v", err)
}

// parseError describes a failure to parse a filter expression.
type parseError struct {
        input string // the complete expression that was being parsed
        pos   int    // byte offset in input at which the problem was detected
        msg   string // description of the problem
}

// Error renders the message together with the input. When pos lies inside the
// input, the byte at pos is highlighted as ">|x|<"; otherwise the input is
// shown unmarked.
func (pe parseError) Error() string {
        if pe.pos >= len(pe.input) {
                return fmt.Sprintf("[%s]: %v", pe.input, pe.msg)
        }

        head, rest := pe.input[:pe.pos], pe.input[pe.pos:]
        return fmt.Sprintf("[%s >|%s|< %s]: %v", head, rest[:1], rest[1:], pe.msg)
}

// mkerr builds a parse error for position pos of the parser's input. The
// message is formatted from format and args, and the resulting parseError is
// wrapped with a "parse error: " prefix.
func (p *parser) mkerr(pos int, format string, args ...interface{}) error {
        pe := parseError{input: p.input, pos: pos}
        pe.msg = fmt.Sprintf(format, args...)
        return fmt.Errorf("parse error: %w", pe)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package filters

import (
        "errors"
        "unicode/utf8"
)

// NOTE(stevvooe): Most of this code in this file is copied from the stdlib
// strconv package and modified to be able to handle quoting with `/` and `|`
// as delimiters.  The copyright is held by the Go authors.

// errQuoteSyntax is returned when a quoted string or an escape sequence
// inside it is malformed.
var errQuoteSyntax = errors.New("quote syntax error")

// unquoteChar decodes the first character or byte in the escaped string
// or character literal represented by the string s.
// It returns four values:
//
//  1. value, the decoded Unicode code point or byte value;
//  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
//  3. tail, the remainder of the string after the character; and
//  4. an error that will be nil if the character is syntactically valid.
//
// The second argument, quote, specifies the type of literal being parsed
// and therefore which escaped quote character is permitted.
// If set to a single quote, it permits the sequence \' and disallows unescaped '.
// If set to a double quote, it permits \" and disallows unescaped ".
// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
// The characters `/` and `|` are handled the same way when used as quote.
//
// s must not be empty: its first byte is read unconditionally.
//
// This is from Go strconv package, modified to support `|` and `/` as double
// quotes for use with regular expressions.
func unquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
        // easy cases
        switch c := s[0]; {
        case c == quote && (quote == '\'' || quote == '"' || quote == '/' || quote == '|'):
                // an unescaped occurrence of the active quote character is not allowed
                err = errQuoteSyntax
                return
        case c >= utf8.RuneSelf:
                // start of a multi-byte UTF-8 sequence: decode one rune
                r, size := utf8.DecodeRuneInString(s)
                return r, true, s[size:], nil
        case c != '\\':
                // plain single-byte character
                return rune(s[0]), false, s[1:], nil
        }

        // hard case: c is backslash
        if len(s) <= 1 {
                err = errQuoteSyntax
                return
        }
        c := s[1]
        s = s[2:]

        switch c {
        case 'a':
                value = '\a'
        case 'b':
                value = '\b'
        case 'f':
                value = '\f'
        case 'n':
                value = '\n'
        case 'r':
                value = '\r'
        case 't':
                value = '\t'
        case 'v':
                value = '\v'
        case 'x', 'u', 'U':
                // hexadecimal escapes: \x takes 2 digits, \u takes 4, \U takes 8
                n := 0
                switch c {
                case 'x':
                        n = 2
                case 'u':
                        n = 4
                case 'U':
                        n = 8
                }
                var v rune
                if len(s) < n {
                        err = errQuoteSyntax
                        return
                }
                for j := 0; j < n; j++ {
                        // NOTE(review): unhex is defined outside this block; presumably
                        // it converts one hex digit and reports whether it was valid.
                        x, ok := unhex(s[j])
                        if !ok {
                                err = errQuoteSyntax
                                return
                        }
                        v = v<<4 | x
                }
                s = s[n:]
                if c == 'x' {
                        // single-byte string, possibly not UTF-8
                        value = v
                        break
                }
                if v > utf8.MaxRune {
                        err = errQuoteSyntax
                        return
                }
                value = v
                multibyte = true
        case '0', '1', '2', '3', '4', '5', '6', '7':
                // octal escape: exactly three digits, value at most 255
                v := rune(c) - '0'
                if len(s) < 2 {
                        err = errQuoteSyntax
                        return
                }
                for j := 0; j < 2; j++ { // one digit already; two more
                        x := rune(s[j]) - '0'
                        if x < 0 || x > 7 {
                                err = errQuoteSyntax
                                return
                        }
                        v = (v << 3) | x
                }
                s = s[2:]
                if v > 255 {
                        err = errQuoteSyntax
                        return
                }
                value = v
        case '\\':
                value = '\\'
        case '\'', '"', '|', '/':
                // an escaped quote character is only valid if it is the active quote
                if c != quote {
                        err = errQuoteSyntax
                        return
                }
                value = rune(c)
        default:
                err = errQuoteSyntax
                return
        }
        tail = s
        return
}

// unquote strips the surrounding quote characters from s and resolves any
// escape sequences, returning the literal value that s denotes.
//
// Accepted quote characters are the backquote (raw literal, no escapes),
// the double quote, the single quote (must denote exactly one character),
// and, as an extension over the standard library, `|` and `/`, which are
// treated like double quotes so that regular expressions can be delimited
// with them.
//
// errQuoteSyntax is returned when s is not a well-formed quoted literal.
func unquote(s string) (string, error) {
        if len(s) < 2 {
                return "", errQuoteSyntax
        }
        quote, last := s[0], s[len(s)-1]
        if quote != last {
                return "", errQuoteSyntax
        }
        s = s[1 : len(s)-1]

        switch quote {
        case '`':
                // Raw literal: no escapes, no embedded backquote, and carriage
                // returns are dropped.
                if contains(s, '`') {
                        return "", errQuoteSyntax
                }
                if !contains(s, '\r') {
                        return s, nil
                }
                // -1 because at least one \r is known to be removed.
                stripped := make([]byte, 0, len(s)-1)
                for i := 0; i < len(s); i++ {
                        if b := s[i]; b != '\r' {
                                stripped = append(stripped, b)
                        }
                }
                return string(stripped), nil
        case '"', '\'', '|', '/':
                // interpreted literal, handled below
        default:
                return "", errQuoteSyntax
        }

        if contains(s, '\n') {
                return "", errQuoteSyntax
        }

        // Fast path: nothing to unescape, so the input can be returned as is
        // without allocating.
        if !contains(s, '\\') && !contains(s, quote) {
                if quote != '\'' {
                        // double quote, pipe and slash all behave the same here
                        return s, nil
                }
                r, size := utf8.DecodeRuneInString(s)
                if size == len(s) && (r != utf8.RuneError || size != 1) {
                        return s, nil
                }
        }

        var scratch [utf8.UTFMax]byte
        out := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
        for len(s) > 0 {
                r, multibyte, rest, err := unquoteChar(s, quote)
                if err != nil {
                        return "", err
                }
                s = rest
                if multibyte && r >= utf8.RuneSelf {
                        w := utf8.EncodeRune(scratch[:], r)
                        out = append(out, scratch[:w]...)
                } else {
                        out = append(out, byte(r))
                }
                if quote == '\'' && len(s) != 0 {
                        // a single-quoted literal holds exactly one character
                        return "", errQuoteSyntax
                }
        }
        return string(out), nil
}

// contains reports whether the byte c occurs anywhere in s.
func contains(s string, c byte) bool {
        for _, b := range []byte(s) {
                if b == c {
                        return true
                }
        }
        return false
}

// unhex converts a single hexadecimal digit to its numeric value. ok is
// false (and v is zero) when b is not one of 0-9, a-f or A-F.
func unhex(b byte) (v rune, ok bool) {
        c := rune(b)
        if c >= '0' && c <= '9' {
                return c - '0', true
        }
        if c >= 'a' && c <= 'f' {
                return 10 + (c - 'a'), true
        }
        if c >= 'A' && c <= 'F' {
                return 10 + (c - 'A'), true
        }
        return 0, false
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package filters

import (
        "unicode"
        "unicode/utf8"
)

// Token kinds reported by the scanner. All kinds are negative; any other
// token value is printed by String as the rune it holds.
const (
        tokenEOF = -1 - iota
        tokenQuoted
        tokenValue
        tokenField
        tokenSeparator
        tokenOperator
        tokenIllegal
)

// token is either one of the negative token kinds above or a literal rune.
type token rune

// String returns the symbolic name of a token kind, or the rune itself
// rendered as a string when t is not one of the known kinds.
func (t token) String() string {
        var name string
        switch t {
        case tokenEOF:
                name = "EOF"
        case tokenQuoted:
                name = "Quoted"
        case tokenValue:
                name = "Value"
        case tokenField:
                name = "Field"
        case tokenSeparator:
                name = "Separator"
        case tokenOperator:
                name = "Operator"
        case tokenIllegal:
                name = "Illegal"
        default:
                name = string(t)
        }
        return name
}

// GoString returns the String form prefixed with "token", for use with %#v.
func (t token) GoString() string {
        const prefix = "token"
        return prefix + t.String()
}

// scanner splits a filter expression into tokens.
type scanner struct {
        input string // text being scanned
        pos   int    // byte offset where the most recently read rune starts
        ppos  int    // bounds the current rune in the string
        value bool   // true after an operator: the next bare token is scanned as a value
        err   string // first error message recorded while scanning, "" if none
}

// init prepares the scanner to tokenize input from its beginning.
//
// All scanning state is reset, including the pending-value flag and any
// recorded error, so that a scanner can be reused for a new input without
// carrying over state from a previous scan.
func (s *scanner) init(input string) {
        s.input = input
        s.pos = 0
        s.ppos = 0
        s.value = false
        s.err = ""
}

// next consumes and returns the next rune of the input.
//
// It returns tokenEOF once the input is exhausted, and tokenIllegal (after
// recording an error) when the input holds a rune that decodes to
// utf8.RuneError or a NUL character.
func (s *scanner) next() rune {
        if len(s.input) <= s.pos {
                return tokenEOF
        }
        // The rune about to be read starts where the previous one ended.
        s.pos = s.ppos

        r, width := utf8.DecodeRuneInString(s.input[s.ppos:])
        s.ppos += width

        switch {
        case r == utf8.RuneError && width > 0:
                s.error("rune error")
                return tokenIllegal
        case r == utf8.RuneError:
                // zero width: nothing left to decode
                return tokenEOF
        case r == 0:
                s.error("unexpected null")
                return tokenIllegal
        }
        return r
}

// peek returns the rune that next would return, then restores the scan
// positions so the rune is not consumed. An error recorded by next while
// peeking is kept.
func (s *scanner) peek() rune {
        savedPos, savedPpos := s.pos, s.ppos
        defer func() {
                s.pos, s.ppos = savedPos, savedPpos
        }()
        return s.next()
}

// scan reads the next token from the input, skipping leading whitespace.
//
// It returns the byte offset at which the token starts, the token kind and
// the token text. For EOF, illegal runes, and runes that start no known
// token, it returns the current position, the rune itself converted to a
// token, and an empty text.
func (s *scanner) scan() (nextp int, tk token, text string) {
        ch := s.next()
        // Skip whitespace; no whitespace rune is a quote, separator or
        // operator, so doing this up front does not change classification.
        for unicode.IsSpace(ch) {
                ch = s.next()
        }
        start := s.pos

        switch {
        case ch == tokenEOF, ch == tokenIllegal:
                // reported through the fallthrough return below
        case isQuoteRune(ch):
                if s.scanQuoted(ch) {
                        return start, tokenQuoted, s.input[start:s.ppos]
                }
                return start, tokenIllegal, s.input[start:s.ppos]
        case isSeparatorRune(ch):
                s.value = false
                return start, tokenSeparator, s.input[start:s.ppos]
        case isOperatorRune(ch):
                s.scanOperator()
                // whatever follows an operator is its operand
                s.value = true
                return start, tokenOperator, s.input[start:s.ppos]
        case s.value:
                s.scanValue()
                s.value = false
                return start, tokenValue, s.input[start:s.ppos]
        case isFieldRune(ch):
                s.scanField()
                return start, tokenField, s.input[start:s.ppos]
        }

        return s.pos, token(ch), ""
}

// scanField consumes runes for as long as the upcoming rune is a field rune.
func (s *scanner) scanField() {
        for isFieldRune(s.peek()) {
                s.next()
        }
}

// scanOperator consumes the remaining runes of an operator, that is, any
// run of '=', '!' and '~' characters.
func (s *scanner) scanOperator() {
        for {
                if ch := s.peek(); ch != '=' && ch != '!' && ch != '~' {
                        return
                }
                s.next()
        }
}

// scanValue consumes runes for as long as the upcoming rune is a value rune.
func (s *scanner) scanValue() {
        for isValueRune(s.peek()) {
                s.next()
        }
}

// scanQuoted consumes a quoted literal up to and including the closing
// quote rune; the opening quote has already been read by the caller.
//
// It returns false when the literal is not terminated before a newline or
// the end of input, or when it contains an illegal escape sequence. An
// illegal escape does not stop the scan: the rest of the literal is still
// consumed.
func (s *scanner) scanQuoted(quote rune) bool {
        wellFormed := true
        ch := s.next() // read character after quote
        for {
                switch {
                case ch == quote:
                        return wellFormed
                case ch == '\n' || ch < 0:
                        // negative runes are the EOF / illegal markers
                        s.error("quoted literal not terminated")
                        return false
                case ch == '\\':
                        var legal bool
                        ch, legal = s.scanEscape(quote)
                        wellFormed = wellFormed && legal
                default:
                        ch = s.next()
                }
        }
}

// scanEscape consumes an escape sequence whose leading backslash has
// already been read. It returns the rune following the sequence and whether
// the sequence was legal. For an unknown escape the offending rune itself is
// returned, unconsumed by a further read, and an error is recorded.
func (s *scanner) scanEscape(quote rune) (ch rune, legal bool) {
        ch = s.next() // read character after '\\'
        switch ch {
        case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
                // single-character escape: just move on to the next rune
                return s.next(), true
        case '0', '1', '2', '3', '4', '5', '6', '7':
                // octal: the digit just read counts as the first of three
                return s.scanDigits(ch, 8, 3)
        case 'x':
                return s.scanDigits(s.next(), 16, 2)
        case 'u':
                return s.scanDigits(s.next(), 16, 4)
        case 'U':
                return s.scanDigits(s.next(), 16, 8)
        }
        s.error("illegal escape sequence")
        return ch, false
}

// scanDigits consumes up to n digits of the given base, starting with ch,
// which has already been read. It returns the first rune after the digits
// and true when all n digits were present; otherwise it records an error and
// returns the offending rune and false.
func (s *scanner) scanDigits(ch rune, base, n int) (rune, bool) {
        remaining := n
        for ; remaining > 0 && digitVal(ch) < base; remaining-- {
                ch = s.next()
        }
        if remaining <= 0 {
                return ch, true
        }
        s.error("illegal numeric escape sequence")
        return ch, false
}

// error records msg as the scanner's error unless one was already recorded;
// only the first error is kept.
func (s *scanner) error(msg string) {
        if s.err != "" {
                return
        }
        s.err = msg
}

// digitVal returns the numeric value of the hexadecimal digit ch, or 16 when
// ch is not a hexadecimal digit. Since 16 exceeds every legal digit value,
// callers can test validity with digitVal(ch) < base.
func digitVal(ch rune) int {
        if ch >= '0' && ch <= '9' {
                return int(ch - '0')
        }
        if ch >= 'a' && ch <= 'f' {
                return int(ch - 'a' + 10)
        }
        if ch >= 'A' && ch <= 'F' {
                return int(ch - 'A' + 10)
        }
        return 16 // larger than any legal digit val
}

// isFieldRune reports whether r may appear in a field name: an ASCII letter,
// an ASCII digit or an underscore.
func isFieldRune(r rune) bool {
        if r == '_' {
                return true
        }
        return isAlphaRune(r) || isDigitRune(r)
}

// isAlphaRune reports whether r is an ASCII letter.
func isAlphaRune(r rune) bool {
        switch {
        case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z':
                return true
        }
        return false
}

// isDigitRune reports whether r is an ASCII decimal digit.
func isDigitRune(r rune) bool {
        if r < '0' {
                return false
        }
        return r <= '9'
}

// isOperatorRune reports whether r can be part of an operator.
func isOperatorRune(r rune) bool {
        return r == '=' || r == '!' || r == '~'
}

// isQuoteRune reports whether r opens a quoted literal. Single quotes are
// not treated as quote runes here.
func isQuoteRune(r rune) bool {
        // maybe add single quoting?
        return r == '/' || r == '|' || r == '"'
}

// isSeparatorRune reports whether r is a separator: a comma or a dot.
func isSeparatorRune(r rune) bool {
        return r == ',' || r == '.'
}

// isValueRune reports whether r may appear in an unquoted value. Commas and
// whitespace never qualify; otherwise r must be a letter, digit, number,
// graphic or punctuation character as classified by the unicode package.
func isValueRune(r rune) bool {
        if r == ',' || unicode.IsSpace(r) {
                return false
        }
        return unicode.IsLetter(r) ||
                unicode.IsDigit(r) ||
                unicode.IsNumber(r) ||
                unicode.IsGraphic(r) ||
                unicode.IsPunct(r)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package gc experiments with providing central gc tooling to ensure
// deterministic resource removal within containerd.
//
// For now, we just have a single exported implementation that can be used
// under certain use cases.
package gc

import (
        "context"
        "sync"
        "time"
)

// ResourceType represents type of resource at a node
type ResourceType uint8

// ResourceMax represents the max resource.
// Upper bits are stripped out during the mark phase, allowing the upper 3 bits
// to be used by the caller reference function.
const ResourceMax = ResourceType(0x1F)

// Node presents a resource which has a type and key,
// this node can be used to lookup other nodes.
type Node struct {
        Type      ResourceType
        Namespace string
        Key       string
}

// Stats about a garbage collection run
type Stats interface {
        // Elapsed returns a duration for the collection run.
        Elapsed() time.Duration
}

// Tricolor implements basic, single-thread tri-color GC. Given the roots and
// a refs function, this function returns a map of all reachable objects.
//
// roots are the nodes from which marking starts. refs is called for a node to
// obtain the nodes it references; it is called at most once per distinct
// node, even when the node appears several times among the roots or is
// referenced by several other nodes. If refs returns an error, marking stops
// and that error is returned with a nil map.
//
// Nodes are tracked with their Type exactly as supplied while marking, but
// the keys of the returned map have the Type bits above ResourceMax cleared.
//
// Correct usage requires that the caller not allow the arguments to change
// until the result is used to delete objects in the system.
//
// It will allocate memory proportional to the size of the reachable set.
//
// We can probably use this to inform a design for incremental GC by injecting
// callbacks to the set modification algorithms.
//
// https://en.wikipedia.org/wiki/Tracing_garbage_collection#Tri-color_marking
func Tricolor(roots []Node, refs func(ref Node) ([]Node, error)) (map[Node]struct{}, error) {
        var (
                grays     []Node                // maintain a gray "stack"
                seen      = map[Node]struct{}{} // or not "white", basically "seen"
                reachable = map[Node]struct{}{} // or "black", in tri-color parlance
        )

        grays = append(grays, roots...)

        for len(grays) > 0 {
                // Pick any gray object
                id := grays[len(grays)-1] // effectively "depth first" because first element
                grays = grays[:len(grays)-1]

                // A node can be queued more than once (duplicate roots, or several
                // referrers reaching it before it is popped). It has already been
                // scanned and marked black, so scanning it again would only repeat
                // the refs call.
                if _, ok := seen[id]; ok {
                        continue
                }
                seen[id] = struct{}{} // post-mark this as not-white

                rs, err := refs(id)
                if err != nil {
                        return nil, err
                }

                // mark all the referenced objects as gray
                for _, target := range rs {
                        if _, ok := seen[target]; !ok {
                                grays = append(grays, target)
                        }
                }

                // strip bits above max resource type
                id.Type = id.Type & ResourceMax
                // mark as black when done
                reachable[id] = struct{}{}
        }

        return reachable, nil
}

// ConcurrentMark implements simple, concurrent GC. All the roots are scanned
// and the complete set of references is formed by calling the refs function
// for each seen object. This function returns a map of all object reachable
// from a root.
//
// Roots are received from the root channel until it is closed. refs is
// invoked in its own goroutine for each distinct node and reports every node
// it references through the callback it is given. The first error returned by
// refs cancels the context passed to the remaining work and is returned to
// the caller; if the context ends for another reason, its error is returned.
//
// Correct usage requires that the caller not allow the arguments to change
// until the result is used to delete objects in the system.
//
// It will allocate memory proportional to the size of the reachable set.
func ConcurrentMark(ctx context.Context, root <-chan Node, refs func(context.Context, Node, func(Node)) error) (map[Node]struct{}, error) {
        ctx, cancel := context.WithCancel(ctx)
        defer cancel()

        var (
                grays = make(chan Node)
                seen  = map[Node]struct{}{} // or not "white", basically "seen"
                // wg counts nodes that were offered to grays and are not yet
                // fully handled (deduplicated or scanned).
                wg sync.WaitGroup

                errOnce sync.Once
                refErr  error
        )

        // Dispatcher: the only goroutine that touches seen while marking is
        // in progress. It drops duplicates and starts one scan per new node.
        go func() {
                for gray := range grays {
                        if _, ok := seen[gray]; ok {
                                // Already scheduled; release the count taken by the sender.
                                wg.Done()
                                continue
                        }
                        seen[gray] = struct{}{} // post-mark this as non-white

                        go func(gray Node) {
                                defer wg.Done()

                                // send queues a referenced node. The count is taken before
                                // the send and released again if the context ends first.
                                send := func(n Node) {
                                        wg.Add(1)
                                        select {
                                        case grays <- n:
                                        case <-ctx.Done():
                                                wg.Done()
                                        }
                                }

                                if err := refs(ctx, gray, send); err != nil {
                                        // Keep only the first error and stop the remaining work.
                                        errOnce.Do(func() {
                                                refErr = err
                                                cancel()
                                        })
                                }

                        }(gray)
                }
        }()

        // Feed the roots; the loop keeps draining root even after the context
        // has ended.
        for r := range root {
                wg.Add(1)
                select {
                case grays <- r:
                case <-ctx.Done():
                        wg.Done()
                }

        }

        // Wait for outstanding grays to be processed
        wg.Wait()

        // Nothing can send anymore; closing grays ends the dispatcher loop.
        close(grays)

        if refErr != nil {
                return nil, refErr
        }
        if cErr := ctx.Err(); cErr != nil {
                return nil, cErr
        }

        return seen, nil
}

// Sweep calls remove for every node in all that is absent from the reachable
// set. Nodes are visited in slice order and the sweep stops at the first
// error returned by remove, which is passed back to the caller.
func Sweep(reachable map[Node]struct{}, all []Node, remove func(Node) error) error {
        // Black (reachable) objects are kept; everything else is white and
        // gets freed.
        for i := range all {
                candidate := all[i]
                if _, live := reachable[candidate]; live {
                        continue
                }
                if err := remove(candidate); err != nil {
                        return err
                }
        }

        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package identifiers provides common validation for identifiers and keys
// across containerd.
//
// Identifiers in containerd must be alphanumeric, allowing limited
// underscores, dashes and dots.
//
// While the character set may be expanded in the future, identifiers
// are guaranteed to be safely used as filesystem path components.
package identifiers

import (
        "fmt"
        "regexp"

        "github.com/containerd/errdefs"
)

const (
        // maxLength is the largest len(s) that Validate accepts.
        maxLength  = 76
        // alphanum matches a run of one or more ASCII letters or digits.
        alphanum   = `[A-Za-z0-9]+`
        // separators matches a single dot, underscore or dash.
        separators = `[._-]`
)

var (
        // identifierRe defines the pattern for valid identifiers: an
        // alphanumeric run followed by any number of groups made of one
        // separator and another alphanumeric run, anchored at both ends.
        identifierRe = regexp.MustCompile(reAnchor(alphanum + reGroup(separators+reGroup(alphanum)) + "*"))
)

// Validate reports whether s is a valid identifier, returning nil when it is.
//
// The rules resemble the domain name rules of RFC 1035, section 2.3.1, but
// are relaxed: numerals may follow a period (".") and mixed case is allowed.
// An identifier must be non-empty, at most maxLength bytes long and match
// identifierRe. Every failure wraps errdefs.ErrInvalidArgument.
//
// In general identifiers that pass this validation should be safe for use as filesystem path components.
func Validate(s string) error {
        switch {
        case s == "":
                return fmt.Errorf("identifier must not be empty: %w", errdefs.ErrInvalidArgument)
        case len(s) > maxLength:
                return fmt.Errorf("identifier %q greater than maximum length (%d characters): %w", s, maxLength, errdefs.ErrInvalidArgument)
        case !identifierRe.MatchString(s):
                return fmt.Errorf("identifier %q must match %v: %w", s, identifierRe, errdefs.ErrInvalidArgument)
        }
        return nil
}

// reGroup wraps the expression s in a non-capturing group.
func reGroup(s string) string {
        const (
                open   = "(?:"
                closer = ")"
        )
        return open + s + closer
}

// reAnchor anchors the expression s to the start and end of the text.
func reAnchor(s string) string {
        const (
                begin = "^"
                end   = "$"
        )
        return begin + s + end
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package bindir

import (
        "bufio"
        "context"
        "encoding/json"
        "errors"
        "fmt"
        "io"
        "os"
        "os/exec"
        "path/filepath"
        "strings"
        "time"

        "github.com/containerd/containerd/v2/internal/tomlext"
        "github.com/containerd/containerd/v2/pkg/imageverifier"
        "github.com/containerd/log"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// outputLimitBytes caps how many bytes of a verifier's stdout and of its
// stderr are read; output beyond the cap is discarded.
const outputLimitBytes = 1 << 15 // 32 KiB

// Config holds the settings of the bin-dir image verifier.
type Config struct {
        // BinDir is the directory whose entries are run as verifier binaries.
        BinDir             string           `toml:"bin_dir"`
        // MaxVerifiers limits how many directory entries are run per image;
        // a negative value disables the limit.
        MaxVerifiers       int              `toml:"max_verifiers"`
        // PerVerifierTimeout bounds the run time of each verifier invocation.
        PerVerifierTimeout tomlext.Duration `toml:"per_verifier_timeout"`
}

// ImageVerifier verifies images by running the binaries found in the
// configured directory.
type ImageVerifier struct {
        config *Config
}

// Compile-time check that *ImageVerifier implements imageverifier.ImageVerifier.
var _ imageverifier.ImageVerifier = (*ImageVerifier)(nil)

// NewImageVerifier returns an ImageVerifier that uses the configuration c.
// The configuration is referenced, not copied.
func NewImageVerifier(c *Config) *ImageVerifier {
        verifier := new(ImageVerifier)
        verifier.config = c
        return verifier
}

// VerifyImage runs the verifier binaries found in the configured directory
// against the image identified by name and desc and returns their combined
// judgement.
//
// The image is accepted when the directory does not exist, when it is empty,
// or when every verifier that was run exits with code 0; the reason then
// lists each verifier with its output. The first verifier that exits with a
// non-zero code rejects the image and no further verifiers are run. At most
// MaxVerifiers entries are run unless that setting is negative. An error is
// returned when the directory cannot be listed or a verifier cannot be run.
func (v *ImageVerifier) VerifyImage(ctx context.Context, name string, desc ocispec.Descriptor) (*imageverifier.Judgement, error) {
        binDir := v.config.BinDir

        // os.ReadDir sorts entries by name.
        entries, err := os.ReadDir(binDir)
        switch {
        case err == nil:
                // proceed
        case errors.Is(err, os.ErrNotExist):
                return &imageverifier.Judgement{
                        OK:     true,
                        Reason: fmt.Sprintf("image verifier directory %v does not exist", binDir),
                }, nil
        default:
                return nil, fmt.Errorf("failed to list directory contents: %w", err)
        }

        if len(entries) == 0 {
                return &imageverifier.Judgement{
                        OK:     true,
                        Reason: fmt.Sprintf("no image verifier binaries found in %v", binDir),
                }, nil
        }

        var reason strings.Builder
        for i, entry := range entries {
                if limit := v.config.MaxVerifiers; limit >= 0 && i >= limit {
                        log.G(ctx).Warnf("image verifiers are being skipped since directory %v has %v entries, more than configured max of %v verifiers", v.config.BinDir, len(entries), v.config.MaxVerifiers)
                        break
                }

                bin := entry.Name()
                began := time.Now()
                exitCode, output, err := v.runVerifier(ctx, bin, name, desc)
                elapsed := time.Since(began)
                if err != nil {
                        return nil, fmt.Errorf("failed to call verifier %v (runtime %v): %w", bin, elapsed, err)
                }

                if exitCode != 0 {
                        // One rejection is final.
                        return &imageverifier.Judgement{
                                OK:     false,
                                Reason: fmt.Sprintf("verifier %v rejected image (exit code %v): %v", bin, exitCode, output),
                        }, nil
                }

                if i > 0 {
                        reason.WriteString(", ")
                }
                fmt.Fprintf(&reason, "%v => %v", bin, output)
        }

        return &imageverifier.Judgement{
                OK:     true,
                Reason: reason.String(),
        }, nil
}

// runVerifier executes the verifier binary bin from the configured directory
// for the given image and reports how it exited.
//
// The binary receives the image name and digest as arguments and the
// JSON-encoded descriptor on stdin. Its stdout (trimmed and limited to
// outputLimitBytes) is returned as reason; its stderr is forwarded line by
// line to the debug log. The whole run is bounded by PerVerifierTimeout.
//
// exitCode is the exit code of the process when it exited on its own. A
// non-nil error is returned, with exitCode -1, when the pipes or the process
// could not be set up or when the process did not exit normally; the error
// wraps the underlying cause.
func (v *ImageVerifier) runVerifier(ctx context.Context, bin string, imageName string, desc ocispec.Descriptor) (exitCode int, reason string, err error) {
        ctx, cancel := context.WithTimeout(ctx, tomlext.ToStdTime(v.config.PerVerifierTimeout))
        defer cancel()

        binPath := filepath.Join(v.config.BinDir, bin)
        args := []string{
                "-name", imageName,
                "-digest", desc.Digest.String(),
                "-stdin-media-type", ocispec.MediaTypeDescriptor,
        }

        cmd := exec.CommandContext(ctx, binPath, args...)

        // We construct our own pipes instead of using the default StdinPipe,
        // StdoutPipe, and StderrPipe in order to set timeouts on reads and writes.
        stdinRead, stdinWrite, err := os.Pipe()
        if err != nil {
                return -1, "", err
        }
        cmd.Stdin = stdinRead
        defer stdinRead.Close()
        defer stdinWrite.Close()

        stdoutRead, stdoutWrite, err := os.Pipe()
        if err != nil {
                return -1, "", err
        }
        cmd.Stdout = stdoutWrite
        defer stdoutRead.Close()
        defer stdoutWrite.Close()

        stderrRead, stderrWrite, err := os.Pipe()
        if err != nil {
                return -1, "", err
        }
        cmd.Stderr = stderrWrite
        defer stderrRead.Close()
        defer stderrWrite.Close()

        // Close parent ends of pipes on timeout. Without this, I/O may hang in the
        // parent process.
        if d, ok := ctx.Deadline(); ok {
                stdinWrite.SetDeadline(d)
                stdoutRead.SetDeadline(d)
                stderrRead.SetDeadline(d)
        }

        // Finish configuring, and then fork & exec the child process.
        p, err := startProcess(ctx, cmd)
        if err != nil {
                return -1, "", err
        }
        defer p.cleanup(ctx)

        // Close the child ends of the pipes in the parent process.
        stdinRead.Close()
        stdoutWrite.Close()
        stderrWrite.Close()

        // Write the descriptor to stdin.
        go func() {
                // Descriptors are usually small enough to fit in a pipe buffer (which is
                // often 64 KiB on Linux) so this write usually won't block on the child
                // process reading stdin. However, synchronously writing to stdin may cause
                // the parent to block if the descriptor is larger than the pipe buffer and
                // the child process doesn't read stdin. Therefore, we write to stdin
                // asynchronously, limited by the stdinWrite deadline set above.
                err := json.NewEncoder(stdinWrite).Encode(desc)
                if err != nil {
                        // This may error out with a "broken pipe" error if the descriptor is
                        // larger than the pipe buffer and the child process does not read all
                        // of stdin.
                        log.G(ctx).WithError(err).Warn("failed to completely write descriptor to stdin")
                }
                stdinWrite.Close()
        }()

        // Pipe verifier stderr lines to debug logs.
        stderrLog := log.G(ctx).Logger.WithFields(log.Fields{
                "image_verifier": bin,
                "stream":         "stderr",
        })
        stderrLogDone := make(chan struct{})
        go func() {
                defer close(stderrLogDone)
                defer stderrRead.Close()
                lr := &io.LimitedReader{
                        R: stderrRead,
                        N: outputLimitBytes,
                }

                s := bufio.NewScanner(lr)
                for s.Scan() {
                        stderrLog.Debug(s.Text())
                }
                if err := s.Err(); err != nil {
                        stderrLog.WithError(err).Debug("error logging image verifier stderr")
                }

                if lr.N == 0 {
                        // Peek ahead to see if stderr reader was truncated.
                        b := make([]byte, 1)
                        if n, _ := stderrRead.Read(b); n > 0 {
                                stderrLog.Debug("(previous logs may be truncated)")
                        }
                }

                // Discard the truncated part of stderr. Doing this rather than closing the
                // reader avoids broken pipe errors. This is bounded by the stderrRead
                // deadline.
                if _, err := io.Copy(io.Discard, stderrRead); err != nil {
                        log.G(ctx).WithError(err).Error("error flushing stderr")
                }
        }()

        stdout, err := io.ReadAll(io.LimitReader(stdoutRead, outputLimitBytes))
        if err != nil {
                // A failed read is only logged; reason then stays empty.
                log.G(ctx).WithError(err).Error("error reading stdout")
        } else {
                m := strings.Builder{}
                m.WriteString(strings.TrimSpace(string(stdout)))
                // Peek ahead to see if stdout is truncated.
                b := make([]byte, 1)
                if n, _ := stdoutRead.Read(b); n > 0 {
                        m.WriteString("(stdout truncated)")
                }
                reason = m.String()
        }

        // Discard the truncated part of stdout. Doing this rather than closing the
        // reader avoids broken pipe errors. This is bounded by the stdoutRead
        // deadline.
        if _, err := io.Copy(io.Discard, stdoutRead); err != nil {
                log.G(ctx).WithError(err).Error("error flushing stdout")
        }
        stdoutRead.Close()

        // Make sure all stderr output has been logged before reaping the child.
        <-stderrLogDone
        if err := cmd.Wait(); err != nil {
                // A process that exited by itself with a non-zero code is a verdict,
                // not a failure to run the verifier.
                if ee := (&exec.ExitError{}); errors.As(err, &ee) && ee.ProcessState.Exited() {
                        return ee.ProcessState.ExitCode(), reason, nil
                }
                // Wrap with %w so callers can still inspect the cause with
                // errors.Is / errors.As.
                return -1, "", fmt.Errorf("waiting on command to exit: %w", err)
        }

        return cmd.ProcessState.ExitCode(), reason, nil
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package bindir

import (
        "context"
        "fmt"
        "os/exec"

        "golang.org/x/sys/unix"
)

// process is the handle returned by startProcess for a started verifier
// command.
type process struct {
        cmd *exec.Cmd // the started command
}

// startProcess configures the verifier command so that cancelling it kills
// the verifier together with all of its child processes, then starts it.
// It returns a handle to the started process, or an error wrapping the
// failure reported by cmd.Start.
func startProcess(ctx context.Context, cmd *exec.Cmd) (*process, error) {
        // Give the verifier its own process group, so that killing the group
        // in Cancel() does not take down the parent process (containerd).
        cmd.SysProcAttr = &unix.SysProcAttr{Setpgid: true}

        cmd.Cancel = func() error {
                // kill(2) with a negative PID signals every process in the
                // process group whose ID is cmd.Process.Pid.
                pgid := cmd.Process.Pid
                return unix.Kill(-pgid, unix.SIGKILL)
        }

        err := cmd.Start()
        if err != nil {
                return nil, fmt.Errorf("starting process: %w", err)
        }

        started := &process{cmd: cmd}
        return started, nil
}

// cleanup is a no-op in this (non-Windows) build.
func (p *process) cleanup(ctx context.Context) {}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package ioutil

import "io"

// wrapReadCloser turns a plain reader into a read closer by relaying its
// data through an in-memory pipe that can be closed.
type wrapReadCloser struct {
        reader *io.PipeReader
        writer *io.PipeWriter
}

// NewWrapReadCloser creates a wrapReadCloser from a reader. The data of r is
// copied into the pipe by a background goroutine, and both pipe ends are
// closed once the copy ends.
// NOTE(random-liu): To avoid goroutine leakage, the reader passed in
// must be eventually closed by the caller.
func NewWrapReadCloser(r io.Reader) io.ReadCloser {
        pipeR, pipeW := io.Pipe()
        wrapped := &wrapReadCloser{reader: pipeR, writer: pipeW}
        go func() {
                _, _ = io.Copy(pipeW, r)
                // Same effect as closing both pipe ends directly.
                _ = wrapped.Close()
        }()
        return wrapped
}

// Read reads up to len(p) bytes into p. A closed pipe is reported as io.EOF.
func (w *wrapReadCloser) Read(p []byte) (int, error) {
        switch n, err := w.reader.Read(p); err {
        case io.ErrClosedPipe:
                return n, io.EOF
        default:
                return n, err
        }
}

// Close closes both ends of the pipe. It always returns nil.
func (w *wrapReadCloser) Close() error {
        _ = w.reader.Close()
        _ = w.writer.Close()
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package ioutil

import (
        "io"
        "sync"
)

// writeCloseInformer wraps passed in write closer with a close channel.
// Caller could wait on the close channel for the write closer to be
// closed.
type writeCloseInformer struct {
        // close is closed the first time Close is called.
        close chan struct{}
        // closeOnce makes sure the close channel is closed only once, so that a
        // repeated Close does not panic with "close of closed channel".
        closeOnce sync.Once
        // wc is the wrapped write closer.
        wc io.WriteCloser
}

// NewWriteCloseInformer creates the writeCloseInformer from a write closer.
// It returns the wrapping write closer together with a channel that is closed
// once Close has been called on the wrapper.
func NewWriteCloseInformer(wc io.WriteCloser) (io.WriteCloser, <-chan struct{}) {
        // Named "done" rather than "close" to avoid shadowing the builtin.
        done := make(chan struct{})
        return &writeCloseInformer{
                close: done,
                wc:    wc,
        }, done
}

// Write passes through the data into the internal write closer.
func (w *writeCloseInformer) Write(p []byte) (int, error) {
        return w.wc.Write(p)
}

// Close closes the internal write closer and informs the close channel.
// The error of the internal write closer is returned. Close may be called
// more than once: each call is forwarded to the internal write closer, but
// the close channel is only closed on the first call.
func (w *writeCloseInformer) Close() error {
        err := w.wc.Close()
        w.closeOnce.Do(func() { close(w.close) })
        return err
}

// nopWriteCloser adapts a plain writer to io.WriteCloser; its Close does
// nothing.
type nopWriteCloser struct {
        w io.Writer
}

// NewNopWriteCloser creates the nopWriteCloser from a writer.
func NewNopWriteCloser(w io.Writer) io.WriteCloser {
        wrapped := new(nopWriteCloser)
        wrapped.w = w
        return wrapped
}

// Write forwards p to the wrapped writer and returns its result unchanged.
func (n *nopWriteCloser) Write(p []byte) (int, error) {
        written, err := n.w.Write(p)
        return written, err
}

// Close is a nop close function; it always returns nil.
func (*nopWriteCloser) Close() error {
        return nil
}

// serialWriteCloser wraps a write closer and serializes every call made
// through it, so that parallel writes never interleave. Use case:
//  1. Pipe: Write content longer than PIPE_BUF.
//     See http://man7.org/linux/man-pages/man7/pipe.7.html
//  2. <3.14 Linux Kernel: write is not atomic
//     See http://man7.org/linux/man-pages/man2/write.2.html
type serialWriteCloser struct {
        mu sync.Mutex
        wc io.WriteCloser
}

// NewSerialWriteCloser creates a SerialWriteCloser from a write closer.
func NewSerialWriteCloser(wc io.WriteCloser) io.WriteCloser {
        serial := new(serialWriteCloser)
        serial.wc = wc
        return serial
}

// Write forwards data to the wrapped write closer while holding the mutex,
// so only one Write or Close runs at a time.
func (s *serialWriteCloser) Write(data []byte) (int, error) {
        s.mu.Lock()
        defer s.mu.Unlock()
        written, err := s.wc.Write(data)
        return written, err
}

// Close closes the wrapped write closer while holding the mutex.
func (s *serialWriteCloser) Close() error {
        s.mu.Lock()
        defer s.mu.Unlock()
        err := s.wc.Close()
        return err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package ioutil

import (
        "errors"
        "io"
        "sync"
)

// WriterGroup fans writes out to a set of keyed write closers. Writers
// can be added and removed at any time; every method takes the group's
// mutex.
type WriterGroup struct {
        mu      sync.Mutex
        writers map[string]io.WriteCloser
        closed  bool
}

var _ io.Writer = &WriterGroup{}

// NewWriterGroup creates an empty writer group.
func NewWriterGroup() *WriterGroup {
        g := new(WriterGroup)
        g.writers = map[string]io.WriteCloser{}
        return g
}

// Add registers w under key. If the group has already been closed, w is
// closed immediately instead of being registered.
func (g *WriterGroup) Add(key string, w io.WriteCloser) {
        g.mu.Lock()
        defer g.mu.Unlock()
        if !g.closed {
                g.writers[key] = w
                return
        }
        w.Close()
}

// Get returns the writer registered under key, or nil if there is none.
func (g *WriterGroup) Get(key string) io.WriteCloser {
        g.mu.Lock()
        defer g.mu.Unlock()
        w := g.writers[key]
        return w
}

// Remove closes the writer registered under key and drops it from the
// group. It does nothing if no writer is registered under key.
func (g *WriterGroup) Remove(key string) {
        g.mu.Lock()
        defer g.mu.Unlock()
        if w, found := g.writers[key]; found {
                w.Close()
                delete(g.writers, key)
        }
}

// Write writes p into each writer. A writer that returns an error or
// writes fewer than len(p) bytes is closed and removed from the group.
// It returns an error if the group is empty once all writers were tried;
// otherwise it reports len(p) bytes written.
func (g *WriterGroup) Write(p []byte) (int, error) {
        g.mu.Lock()
        defer g.mu.Unlock()
        want := len(p)
        for key, w := range g.writers {
                if n, err := w.Write(p); err != nil || n != want {
                        // The writer is closed or in bad state, remove it.
                        w.Close()
                        delete(g.writers, key)
                }
        }
        if len(g.writers) == 0 {
                return 0, errors.New("writer group is empty")
        }
        return want, nil
}

// Close closes every writer in the group and marks the group closed.
// Write will return error after closed.
func (g *WriterGroup) Close() {
        g.mu.Lock()
        defer g.mu.Unlock()
        for key := range g.writers {
                g.writers[key].Close()
        }
        g.writers, g.closed = nil, true
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

/*
   File copied and customized based on
   https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux.go
*/

package kernelversion

import (
        "bytes"
        "fmt"
        "sync"

        "golang.org/x/sys/unix"
)

// KernelVersion identifies a kernel by the first two numeric components of
// its release string.
type KernelVersion struct {
        Kernel uint64 // Version of the Kernel (i.e., the "4" in "4.1.2-generic")
        Major  uint64 // Major revision of the Kernel (i.e., the "1" in "4.1.2-generic")
}

// String implements fmt.Stringer for KernelVersion. It renders the version
// as "<Kernel>.<Major>", or as the empty string when both fields are zero.
func (k *KernelVersion) String() string {
        if k.Kernel == 0 && k.Major == 0 {
                return ""
        }
        return fmt.Sprintf("%d.%d", k.Kernel, k.Major)
}

// Cached result of the one-time kernel version lookup done by
// getKernelVersion.
var (
        // currentKernelVersion is the parsed version stored by getKernelVersion.
        currentKernelVersion *KernelVersion
        // kernelVersionError is the error stored alongside currentKernelVersion.
        kernelVersionError   error
        // once makes sure the lookup in getKernelVersion runs a single time.
        once                 sync.Once
)

// getKernelVersion gets the current kernel version.
//
// The lookup runs once; its result (version or error) is cached in
// currentKernelVersion and kernelVersionError and returned on every call.
// A failing uname(2) is recorded as the error, so callers never receive a
// nil version together with a nil error.
func getKernelVersion() (*KernelVersion, error) {
        once.Do(func() {
                var uts unix.Utsname
                if err := unix.Uname(&uts); err != nil {
                        // Record the failure; otherwise callers would see (nil, nil)
                        // and dereference a nil version.
                        kernelVersionError = fmt.Errorf("failed to get kernel version: %w", err)
                        return
                }
                // Cut the release at the first \x00 so it parses correctly. If no
                // terminator is present, use the whole buffer rather than slicing
                // with a negative index.
                release := uts.Release[:]
                if end := bytes.IndexByte(release, 0); end >= 0 {
                        release = release[:end]
                }
                currentKernelVersion, kernelVersionError = parseRelease(string(release))
        })
        return currentKernelVersion, kernelVersionError
}

// parseRelease builds a KernelVersion from a kernel release string. It
// returns a wrapped error when the string does not start with two
// dot-separated numbers.
func parseRelease(release string) (*KernelVersion, error) {
        version := new(KernelVersion)

        // Only the "kernel" and "major revision" are read. Sometimes we have
        // 3.12.25-gentoo, but sometimes we just have 3.12-1-amd64.
        if _, err := fmt.Sscanf(release, "%d.%d", &version.Kernel, &version.Major); err != nil {
                return nil, fmt.Errorf("failed to parse kernel version %q: %w", release, err)
        }
        return version, nil
}

// GreaterEqualThan checks if the host's kernel version is greater than, or
// equal to the given kernel version v. Only "kernel version" and "major revision"
// can be specified (e.g., "3.12") and will be taken into account, which means
// that 3.12.25-gentoo and 3.12-1-amd64 are considered equal (kernel: 3, major: 12).
// An error from looking up the host's kernel version is returned as is.
func GreaterEqualThan(minVersion KernelVersion) (bool, error) {
        host, err := getKernelVersion()
        if err != nil {
                return false, err
        }
        switch {
        case host.Kernel > minVersion.Kernel:
                return true, nil
        case host.Kernel < minVersion.Kernel:
                return false, nil
        default:
                // Same kernel version: the major revision decides.
                return host.Major >= minVersion.Major, nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package labels

import (
        "fmt"

        "github.com/containerd/errdefs"
)

const (
        // maxSize is the largest allowed combined length, in bytes, of a label's
        // key and value.
        maxSize = 4096
        // maximum length of key portion of error message if len of key + len of value > maxSize
        keyMaxLen = 64
)

// Validate checks that the combined length of a label's key and value is at
// most maxSize (4096) bytes. When it is larger, an error wrapping
// errdefs.ErrInvalidArgument is returned; the key quoted in the message is
// cut to its first keyMaxLen bytes.
func Validate(k, v string) error {
        total := len(k) + len(v)
        if total <= maxSize {
                return nil
        }

        shown := k
        if len(shown) > keyMaxLen {
                shown = shown[:keyMaxLen]
        }
        return fmt.Errorf("label key and value length (%d bytes) greater than maximum size (%d bytes), key: %s: %w", total, maxSize, shown, errdefs.ErrInvalidArgument)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package namespaces

import (
        "context"
        "fmt"
        "os"

        "github.com/containerd/containerd/v2/pkg/identifiers"
        "github.com/containerd/errdefs"
)

const (
        // NamespaceEnvVar is the environment variable key name that
        // NamespaceFromEnv reads the namespace from.
        NamespaceEnvVar = "CONTAINERD_NAMESPACE"
        // Default is the name of the default namespace, used by NamespaceFromEnv
        // when the environment variable is empty.
        Default = "default"
)

// namespaceKey is the context key under which WithNamespace stores the
// namespace and Namespace looks it up.
type namespaceKey struct{}

// WithNamespace returns a context carrying the given namespace, both as a
// context value and in the grpc and ttrpc headers.
func WithNamespace(ctx context.Context, namespace string) context.Context {
        // Record the namespace under our own context key first.
        withValue := context.WithValue(ctx, namespaceKey{}, namespace)
        // Then mirror it onto the grpc and ttrpc headers so it gets picked up by
        // any clients that are using this context.
        withGRPC := withGRPCNamespaceHeader(withValue, namespace)
        return withTTRPCNamespaceHeader(withGRPC, namespace)
}

// NamespaceFromEnv sets on the context the namespace found in the
// CONTAINERD_NAMESPACE environment variable, falling back to Default when
// the variable is empty or unset.
func NamespaceFromEnv(ctx context.Context) context.Context {
        if fromEnv := os.Getenv(NamespaceEnvVar); fromEnv != "" {
                return WithNamespace(ctx, fromEnv)
        }
        return WithNamespace(ctx, Default)
}

// Namespace returns the namespace from the context. It looks, in order, at
// the context value, the grpc header and the ttrpc header, and returns the
// first one found.
//
// The namespace is not guaranteed to be valid.
func Namespace(ctx context.Context) (string, bool) {
        if ns, ok := ctx.Value(namespaceKey{}).(string); ok {
                return ns, true
        }
        if ns, ok := fromGRPCHeader(ctx); ok {
                return ns, true
        }
        return fromTTRPCHeader(ctx)
}

// NamespaceRequired returns the valid namespace from the context or an error.
// A missing or empty namespace yields an error wrapping
// errdefs.ErrFailedPrecondition; a namespace rejected by
// identifiers.Validate yields that validation error, wrapped.
func NamespaceRequired(ctx context.Context) (string, error) {
        ns, found := Namespace(ctx)
        if !(found && ns != "") {
                return "", fmt.Errorf("namespace is required: %w", errdefs.ErrFailedPrecondition)
        }

        err := identifiers.Validate(ns)
        if err != nil {
                return "", fmt.Errorf("namespace validation: %w", err)
        }
        return ns, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package namespaces

import (
        "context"

        "google.golang.org/grpc/metadata"
)

const (
        // GRPCHeader defines the header name for specifying a containerd namespace.
        // It is the metadata key written by withGRPCNamespaceHeader and read by
        // fromGRPCHeader.
        GRPCHeader = "containerd-namespace"
)

// NOTE(stevvooe): We can stub this file out if we don't want a grpc dependency here.

// withGRPCNamespaceHeader returns a context whose outgoing grpc metadata
// carries namespace under GRPCHeader, so it gets picked up by any clients
// that are using this context. Metadata already on the outgoing context is
// kept and joined after the new header.
func withGRPCNamespaceHeader(ctx context.Context, namespace string) context.Context {
        md := metadata.Pairs(GRPCHeader, namespace)
        if existing, ok := metadata.FromOutgoingContext(ctx); ok {
                // order ensures the latest is first in this list.
                md = metadata.Join(md, existing)
        }
        return metadata.NewOutgoingContext(ctx, md)
}

// fromGRPCHeader extracts the namespace from the incoming grpc metadata of
// ctx, for use in grpc servers. It returns the first value stored under
// GRPCHeader, or "" and false when there is no incoming metadata or no such
// header.
func fromGRPCHeader(ctx context.Context) (string, bool) {
        md, ok := metadata.FromIncomingContext(ctx)
        if !ok {
                // TODO(stevvooe): Check outgoing context?
                return "", false
        }

        if values := md[GRPCHeader]; len(values) > 0 {
                return values[0], true
        }
        return "", false
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package namespaces

import (
        "context"

        "github.com/containerd/ttrpc"
)

const (
        // TTRPCHeader defines the header name for specifying a containerd namespace.
        // It is the metadata key written by withTTRPCNamespaceHeader and read by
        // fromTTRPCHeader.
        TTRPCHeader = "containerd-namespace-ttrpc"
)

// copyMetadata returns a new ttrpc.MD holding every key of src, with each
// value list copied into a fresh slice so the result does not share slices
// with src.
func copyMetadata(src ttrpc.MD) ttrpc.MD {
        dup := ttrpc.MD{}
        for key, values := range src {
                dup[key] = append([]string(nil), values...)
        }
        return dup
}

// withTTRPCNamespaceHeader returns a context whose ttrpc metadata has
// TTRPCHeader set to namespace. Metadata already on ctx is copied first, so
// the metadata attached to ctx itself is not modified.
func withTTRPCNamespaceHeader(ctx context.Context, namespace string) context.Context {
        md := ttrpc.MD{}
        if existing, ok := ttrpc.GetMetadata(ctx); ok {
                md = copyMetadata(existing)
        }
        md.Set(TTRPCHeader, namespace)
        return ttrpc.WithMetadata(ctx, md)
}

// fromTTRPCHeader looks up TTRPCHeader in the ttrpc metadata of ctx and
// returns the result of ttrpc.GetMetadataValue unchanged.
func fromTTRPCHeader(ctx context.Context) (string, bool) {
        return ttrpc.GetMetadataValue(ctx, TTRPCHeader)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Copyright 2018 CNI authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package netns

import (
        "crypto/rand"
        "fmt"
        "os"
        "path"
        "runtime"
        "sync"

        "github.com/containerd/containerd/v2/core/mount"
        cnins "github.com/containernetworking/plugins/pkg/ns"
        "github.com/moby/sys/symlink"
        "golang.org/x/sys/unix"
)

// Some of the following functions are migrated from
// https://github.com/containernetworking/plugins/blob/main/pkg/testutils/netns_linux.go

// newNS creates a new persistent (bind-mounted) network namespace and returns the
// path to the network namespace.
// If pid is not 0, returns the netns from that pid persistently mounted. Otherwise,
// a new netns is created.
//
// The mount point is a file named "cni-<random hex>" created under baseDir
// (which is created if missing). If anything fails after the file has been
// created, the file is removed again before returning.
func newNS(baseDir string, pid uint32) (nsPath string, err error) {
        // 16 random bytes provide the unique part of the namespace name.
        b := make([]byte, 16)

        _, err = rand.Read(b)
        if err != nil {
                return "", fmt.Errorf("failed to generate random netns name: %w", err)
        }

        // Create the directory for mounting network namespaces
        // This needs to be a shared mountpoint in case it is mounted in to
        // other namespaces (containers)
        if err := os.MkdirAll(baseDir, 0755); err != nil {
                return "", err
        }

        // create an empty file at the mount point and fail if it already exists
        nsName := fmt.Sprintf("cni-%x-%x-%x-%x-%x", b[0:4], b[4:6], b[6:8], b[8:10], b[10:])
        nsPath = path.Join(baseDir, nsName)
        mountPointFd, err := os.OpenFile(nsPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666)
        if err != nil {
                return "", err
        }
        // Only the file's existence matters; it serves as the bind-mount target.
        mountPointFd.Close()

        // The deferred func reads the named result err, so it sees the error of
        // whichever return statement ends the function.
        defer func() {
                // Ensure the mount point is cleaned up on errors
                if err != nil {
                        os.RemoveAll(nsPath)
                }
        }()

        if pid != 0 {
                procNsPath := getNetNSPathFromPID(pid)
                // bind mount the netns onto the mount point. This causes the namespace
                // to persist, even when there are no threads in the ns.
                if err = unix.Mount(procNsPath, nsPath, "none", unix.MS_BIND, ""); err != nil {
                        return "", fmt.Errorf("failed to bind mount ns src: %v at %s: %w", procNsPath, nsPath, err)
                }
                return nsPath, nil
        }

        // The goroutine below writes its outcome into the named result err; the
        // WaitGroup makes sure it has finished before err is read here.
        var wg sync.WaitGroup
        wg.Add(1)

        // do namespace work in a dedicated goroutine, so that we can safely
        // Lock/Unlock OSThread without upsetting the lock/unlock state of
        // the caller of this function
        go (func() {
                defer wg.Done()
                runtime.LockOSThread()
                // Don't unlock. By not unlocking, golang will kill the OS thread when the
                // goroutine is done (for go1.10+)

                // Remember the thread's current namespace before switching away.
                var origNS cnins.NetNS
                origNS, err = cnins.GetNS(getCurrentThreadNetNSPath())
                if err != nil {
                        return
                }
                defer origNS.Close()

                // create a new netns on the current thread
                err = unix.Unshare(unix.CLONE_NEWNET)
                if err != nil {
                        return
                }

                // Put this thread back to the orig ns, since it might get reused (pre go1.10)
                defer origNS.Set()

                // bind mount the netns from the current thread (from /proc) onto the
                // mount point. This causes the namespace to persist, even when there
                // are no threads in the ns.
                err = unix.Mount(getCurrentThreadNetNSPath(), nsPath, "none", unix.MS_BIND, "")
                if err != nil {
                        err = fmt.Errorf("failed to bind mount ns at %s: %w", nsPath, err)
                }
        })()
        wg.Wait()

        if err != nil {
                return "", fmt.Errorf("failed to create namespace: %w", err)
        }

        return nsPath, nil
}

// unmountNS unmounts the network namespace mounted at path and removes the
// mount point. unmountNS is idempotent: a path that does not exist is not an
// error.
func unmountNS(path string) error {
        _, statErr := os.Stat(path)
        switch {
        case statErr == nil:
                // The path exists; carry on with the unmount below.
        case os.IsNotExist(statErr):
                return nil
        default:
                return fmt.Errorf("failed to stat netns: %w", statErr)
        }

        // Resolve symlinks first so the unmount and removal act on the real path.
        resolved, err := symlink.FollowSymlinkInScope(path, "/")
        if err != nil {
                return fmt.Errorf("failed to follow symlink: %w", err)
        }

        err = mount.Unmount(resolved, unix.MNT_DETACH)
        if err != nil && !os.IsNotExist(err) {
                return fmt.Errorf("failed to umount netns: %w", err)
        }

        if err = os.RemoveAll(resolved); err != nil {
                return fmt.Errorf("failed to remove netns: %w", err)
        }
        return nil
}

// getCurrentThreadNetNSPath returns the /proc path of the network namespace
// of the calling thread. Copied from pkg/ns.
func getCurrentThreadNetNSPath() string {
        // /proc/self/ns/net returns the namespace of the main thread, not
        // of whatever thread this goroutine is running on.  Make sure we
        // use the thread's net namespace since the thread is switching around
        pid, tid := os.Getpid(), unix.Gettid()
        return fmt.Sprintf("/proc/%d/task/%d/ns/net", pid, tid)
}

// getNetNSPathFromPID returns the /proc path of the network namespace of
// the process with the given pid.
func getNetNSPathFromPID(pid uint32) string {
        const pathFormat = "/proc/%d/ns/net"
        return fmt.Sprintf(pathFormat, pid)
}

// NetNS holds network namespace.
type NetNS struct {
        // path is the filesystem path of the network namespace, as returned by
        // newNS or passed to LoadNetNS.
        path string
}

// NewNetNS creates a network namespace.
// The name of the network namespace is randomly generated.
// The returned netns is created under baseDir, with its path
// following the pattern "baseDir/<generated-name>".
// It is NewNetNSFromPID called with a pid of 0.
func NewNetNS(baseDir string) (*NetNS, error) {
        return NewNetNSFromPID(baseDir, 0)
}

// NewNetNSFromPID returns the netns from pid or a new netns if pid is 0.
// The name of the network namespace is randomly generated.
// The returned netns is created under baseDir, with its path
// following the pattern "baseDir/<generated-name>".
// A failure to set up the namespace is returned wrapped.
func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) {
        nsPath, err := newNS(baseDir, pid)
        if err != nil {
                return nil, fmt.Errorf("failed to setup netns: %w", err)
        }
        ns := &NetNS{path: nsPath}
        return ns, nil
}

// LoadNetNS loads existing network namespace.
// It only records path in the returned NetNS; the path is not checked here.
func LoadNetNS(path string) *NetNS {
        return &NetNS{path: path}
}

// Remove removes network namespace. Remove is idempotent, meaning it might
// be invoked multiple times and provides consistent result.
// It delegates to unmountNS on the namespace's path.
func (n *NetNS) Remove() error {
        return unmountNS(n.path)
}

// Closed checks whether the network namespace has been closed.
//
// It reports true when the namespace path no longer exists, and also when
// the path exists but is not a namespace; in the latter case the leftover
// path is removed first. It reports false when the namespace can still be
// opened.
func (n *NetNS) Closed() (bool, error) {
        ns, err := cnins.GetNS(n.path)
        switch err.(type) {
        case nil:
                // The namespace could be opened; release the handle below.
        case cnins.NSPathNotExistErr:
                // The network namespace has already been removed.
                return true, nil
        case cnins.NSPathNotNSErr:
                // The network namespace is not mounted, remove it.
                if rmErr := os.RemoveAll(n.path); rmErr != nil {
                        return false, fmt.Errorf("remove netns: %w", rmErr)
                }
                return true, nil
        default:
                return false, fmt.Errorf("get netns fd: %w", err)
        }

        if closeErr := ns.Close(); closeErr != nil {
                return false, fmt.Errorf("close netns fd: %w", closeErr)
        }
        return false, nil
}

// GetPath returns network namespace path for sandbox container.
// It is the path stored in the NetNS when it was created or loaded.
func (n *NetNS) GetPath() string {
        return n.path
}

// Do runs a function in the network namespace.
//
// It opens the namespace at n.path, passes f to the opened namespace's Do
// method and returns that call's result. A failure to open the namespace is
// returned wrapped, without calling f.
func (n *NetNS) Do(f func(cnins.NetNS) error) error {
        ns, err := cnins.GetNS(n.path)
        if err != nil {
                return fmt.Errorf("get netns fd: %w", err)
        }
        // Release the namespace handle once f has run.
        defer ns.Close()
        return ns.Do(f)
}

//go:build !freebsd

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package oci

import (
        specs "github.com/opencontainers/runtime-spec/specs-go"
)

// defaultMounts returns the mounts used on platforms other than FreeBSD (see
// the build constraint of this file): /proc, /dev, /dev/pts, /dev/shm,
// /dev/mqueue, /sys and /run. A new slice is built on every call.
func defaultMounts() []specs.Mount {
        return []specs.Mount{
                {
                        Destination: "/proc",
                        Type:        "proc",
                        Source:      "proc",
                        Options:     []string{"nosuid", "noexec", "nodev"},
                },
                {
                        Destination: "/dev",
                        Type:        "tmpfs",
                        Source:      "tmpfs",
                        Options:     []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
                },
                {
                        Destination: "/dev/pts",
                        Type:        "devpts",
                        Source:      "devpts",
                        Options:     []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
                },
                {
                        Destination: "/dev/shm",
                        Type:        "tmpfs",
                        Source:      "shm",
                        Options:     []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
                },
                {
                        Destination: "/dev/mqueue",
                        Type:        "mqueue",
                        Source:      "mqueue",
                        Options:     []string{"nosuid", "noexec", "nodev"},
                },
                {
                        // sysfs is the only entry mounted read-only ("ro").
                        Destination: "/sys",
                        Type:        "sysfs",
                        Source:      "sysfs",
                        Options:     []string{"nosuid", "noexec", "nodev", "ro"},
                },
                {
                        Destination: "/run",
                        Type:        "tmpfs",
                        Source:      "tmpfs",
                        Options:     []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
                },
        }
}

// appendOSMounts is only used on FreeBSD, and a no-op on other platforms.
// This variant ignores both of its arguments.
func appendOSMounts(_ *Spec, _ string) {}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package oci

import (
        "context"
        "encoding/json"
        "os"
        "path/filepath"
        "runtime"

        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "github.com/opencontainers/runtime-spec/specs-go"

        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/platforms"
)

const (
        // rwm is the literal "rwm".
        // NOTE(review): presumably the read/write/mknod device access string — confirm at its use sites.
        rwm               = "rwm"
        // defaultRootfsPath is the literal "rootfs".
        // NOTE(review): presumably the default root filesystem path of a spec — confirm at its use sites.
        defaultRootfsPath = "rootfs"
)

var (
        // defaultUnixEnv is an environment list holding a single PATH entry.
        // NOTE(review): presumably the default process environment for Unix specs — confirm at its use sites.
        defaultUnixEnv = []string{
                "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
        }
)

// Spec is a type alias to the OCI runtime spec to allow third-party SpecOpts
// to be created without the "issues" with go vendoring and package imports.
type Spec = specs.Spec

// ConfigFilename is the file name "config.json".
// NOTE(review): presumably the name of the OCI runtime spec file inside a bundle — confirm with callers.
const ConfigFilename = "config.json"

// ReadSpec opens the file at path and decodes its JSON content into an OCI
// runtime Spec. Errors from opening the file or decoding the JSON are
// returned as is.
func ReadSpec(path string) (*Spec, error) {
        f, err := os.Open(path)
        if err != nil {
                return nil, err
        }
        defer f.Close()

        spec := new(Spec)
        if err = json.NewDecoder(f).Decode(spec); err != nil {
                return nil, err
        }
        return spec, nil
}

// GenerateSpec will generate a default spec from the provided image
// for use as a containerd container.
// It is GenerateSpecWithPlatform called with platforms.DefaultString() as
// the platform.
func GenerateSpec(ctx context.Context, client Client, c *containers.Container, opts ...SpecOpts) (*Spec, error) {
        return GenerateSpecWithPlatform(ctx, client, platforms.DefaultString(), c, opts...)
}

// GenerateSpecWithPlatform will generate a default spec from the provided image
// for use as a containerd container in the platform requested.
//
// The default spec for the platform is built first and opts are then applied
// to it in order. If building the default spec fails, nil and the error are
// returned. If applying opts fails, the error is returned together with the
// spec as it stood at that point.
func GenerateSpecWithPlatform(ctx context.Context, client Client, platform string, c *containers.Container, opts ...SpecOpts) (*Spec, error) {
        spec := new(Spec)
        if err := generateDefaultSpecWithPlatform(ctx, platform, c.ID, spec); err != nil {
                return nil, err
        }

        err := ApplyOpts(ctx, client, c, spec, opts...)
        return spec, err
}

// generateDefaultSpecWithPlatform parses platform and overwrites *s with the
// default spec for that platform's OS: "windows" and "darwin" have dedicated
// defaults, every other OS gets the Unix default.
func generateDefaultSpecWithPlatform(ctx context.Context, platform, id string, s *Spec) error {
        parsed, err := platforms.Parse(platform)
        if err != nil {
                return err
        }

        if parsed.OS == "windows" {
                return populateDefaultWindowsSpec(ctx, s, id)
        }
        if parsed.OS == "darwin" {
                return populateDefaultDarwinSpec(s)
        }

        if err := populateDefaultUnixSpec(ctx, s, id); err != nil {
                return err
        }
        if runtime.GOOS == "windows" {
                // To run LCOW we have a Linux and Windows section. Add an empty one now.
                s.Windows = &specs.Windows{}
        }
        return nil
}

// ApplyOpts runs each option against s in the order given, passing along the
// context, client and container. It stops at, and returns, the first error.
func ApplyOpts(ctx context.Context, client Client, c *containers.Container, s *Spec, opts ...SpecOpts) error {
        for i := range opts {
                if err := opts[i](ctx, client, c, s); err != nil {
                        return err
                }
        }
        return nil
}

// defaultUnixCaps returns the capability names granted by the default Unix
// spec. A freshly allocated slice is returned on every call so that callers
// can store or modify it without affecting each other.
func defaultUnixCaps() []string {
        caps := make([]string, 0, 14)
        caps = append(caps,
                "CAP_CHOWN",
                "CAP_DAC_OVERRIDE",
                "CAP_FSETID",
                "CAP_FOWNER",
                "CAP_MKNOD",
                "CAP_NET_RAW",
                "CAP_SETGID",
                "CAP_SETUID",
                "CAP_SETFCAP",
                "CAP_SETPCAP",
                "CAP_NET_BIND_SERVICE",
                "CAP_SYS_CHROOT",
                "CAP_KILL",
                "CAP_AUDIT_WRITE",
        )
        return caps
}

// defaultUnixNamespaces returns the namespaces listed in the default Unix
// spec: PID, IPC, UTS, mount and network, in that order, none with a path.
func defaultUnixNamespaces() []specs.LinuxNamespace {
        kinds := [...]specs.LinuxNamespaceType{
                specs.PIDNamespace,
                specs.IPCNamespace,
                specs.UTSNamespace,
                specs.MountNamespace,
                specs.NetworkNamespace,
        }

        out := make([]specs.LinuxNamespace, 0, len(kinds))
        for _, kind := range kinds {
                out = append(out, specs.LinuxNamespace{Type: kind})
        }
        return out
}

// populateDefaultUnixSpec overwrites *s with the default Unix spec for the
// container identified by id. The namespace is read from ctx and is required;
// it is combined with id to form the cgroups path "/<namespace>/<id>".
func populateDefaultUnixSpec(ctx context.Context, s *Spec, id string) error {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }

        process := &specs.Process{
                Cwd:             "/",
                NoNewPrivileges: true,
                User:            specs.User{UID: 0, GID: 0},
                // Each set gets its own slice so later edits to one do not leak
                // into the others.
                Capabilities: &specs.LinuxCapabilities{
                        Bounding:  defaultUnixCaps(),
                        Permitted: defaultUnixCaps(),
                        Effective: defaultUnixCaps(),
                },
                Rlimits: []specs.POSIXRlimit{
                        {Type: "RLIMIT_NOFILE", Hard: 1024, Soft: 1024},
                },
        }

        linux := &specs.Linux{
                MaskedPaths: []string{
                        "/proc/acpi",
                        "/proc/asound",
                        "/proc/kcore",
                        "/proc/keys",
                        "/proc/latency_stats",
                        "/proc/timer_list",
                        "/proc/timer_stats",
                        "/proc/sched_debug",
                        "/sys/firmware",
                        "/sys/devices/virtual/powercap",
                        "/proc/scsi",
                },
                ReadonlyPaths: []string{
                        "/proc/bus",
                        "/proc/fs",
                        "/proc/irq",
                        "/proc/sys",
                        "/proc/sysrq-trigger",
                },
                CgroupsPath: filepath.Join("/", ns, id),
                Resources: &specs.LinuxResources{
                        // A single deny rule covering all access types.
                        Devices: []specs.LinuxDeviceCgroup{
                                {Allow: false, Access: rwm},
                        },
                },
                Namespaces: defaultUnixNamespaces(),
        }

        *s = Spec{
                Version: specs.Version,
                Root:    &specs.Root{Path: defaultRootfsPath},
                Process: process,
                Linux:   linux,
                Mounts:  defaultMounts(),
        }
        return nil
}

// populateDefaultWindowsSpec overwrites *s with the default Windows spec: an
// empty root, a process whose working directory is C:\ and an empty Windows
// section. ctx and id are not used; the function always returns nil.
func populateDefaultWindowsSpec(ctx context.Context, s *Spec, id string) error {
        var spec Spec
        spec.Version = specs.Version
        spec.Root = &specs.Root{}
        spec.Process = &specs.Process{Cwd: `C:\`}
        spec.Windows = &specs.Windows{}

        *s = spec
        return nil
}

// populateDefaultDarwinSpec overwrites *s with the default Darwin spec: an
// empty root and a process whose working directory is "/". It always returns
// nil.
func populateDefaultDarwinSpec(s *Spec) error {
        var spec Spec
        spec.Version = specs.Version
        spec.Root = &specs.Root{}
        spec.Process = &specs.Process{Cwd: "/"}

        *s = spec
        return nil
}

// DescriptorFromProto converts containerd's protobuf [types.Descriptor] into
// the OCI image spec [ocispec.Descriptor], copying the media type, digest,
// size and annotations. d must not be nil.
func DescriptorFromProto(d *types.Descriptor) ocispec.Descriptor {
        var desc ocispec.Descriptor
        desc.MediaType = d.MediaType
        desc.Digest = digest.Digest(d.Digest)
        desc.Size = d.Size
        desc.Annotations = d.Annotations
        return desc
}

// DescriptorToProto converts the OCI image spec [ocispec.Descriptor] into a
// newly allocated containerd protobuf [types.Descriptor], copying the media
// type, digest (as a string), size and annotations.
func DescriptorToProto(d ocispec.Descriptor) *types.Descriptor {
        out := new(types.Descriptor)
        out.MediaType = d.MediaType
        out.Digest = d.Digest.String()
        out.Size = d.Size
        out.Annotations = d.Annotations
        return out
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package oci

import (
        "bufio"
        "context"
        "encoding/json"
        "errors"
        "fmt"
        "os"
        "path/filepath"
        "runtime"
        "strconv"
        "strings"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/continuity/fs"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
        "github.com/moby/sys/user"
        v1 "github.com/opencontainers/image-spec/specs-go/v1"
        "github.com/opencontainers/runtime-spec/specs-go"
        "tags.cncf.io/container-device-interface/pkg/cdi"
)

// SpecOpts sets spec-specific information on a newly generated OCI spec.
//
// An option receives the context, the client, the container the spec is being
// generated for and the spec to modify in place. Returning a non-nil error
// stops the remaining options in ApplyOpts and Compose from running.
type SpecOpts func(context.Context, Client, *containers.Container, *Spec) error

// Compose bundles several spec options into one. The returned option runs
// opts in order and stops at, and returns, the first error.
func Compose(opts ...SpecOpts) SpecOpts {
        composed := func(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
                for i := range opts {
                        if err := opts[i](ctx, client, c, s); err != nil {
                                return err
                        }
                }
                return nil
        }
        return composed
}

// setProcess makes sure s.Process is non-nil, installing an empty Process
// when it is unset.
func setProcess(s *Spec) {
        if s.Process != nil {
                return
        }
        s.Process = &specs.Process{}
}

// setRoot makes sure s.Root is non-nil, installing an empty Root when it is
// unset.
func setRoot(s *Spec) {
        if s.Root != nil {
                return
        }
        s.Root = &specs.Root{}
}

// setLinux makes sure s.Linux is non-nil, installing an empty Linux section
// when it is unset.
func setLinux(s *Spec) {
        if s.Linux != nil {
                return
        }
        s.Linux = &specs.Linux{}
}

// setResources installs an empty Linux resources section when the spec has a
// Linux section without one. A spec with no Linux section is left untouched.
func setResources(s *Spec) {
        if s.Linux == nil || s.Linux.Resources != nil {
                return
        }
        s.Linux.Resources = &specs.LinuxResources{}
}

// setResourcesWindows installs an empty Windows resources section when the
// spec has a Windows section without one. A spec with no Windows section is
// left untouched.
//
//nolint:nolintlint,unused // not used on all platforms
func setResourcesWindows(s *Spec) {
        if s.Windows == nil || s.Windows.Resources != nil {
                return
        }
        s.Windows.Resources = &specs.WindowsResources{}
}

// setCPU makes sure the Linux resources section and its CPU entry exist when
// the spec has a Linux section. A spec with no Linux section is left
// untouched.
//
//nolint:nolintlint,unused // not used on all platforms
func setCPU(s *Spec) {
        setResources(s)
        if s.Linux == nil || s.Linux.Resources.CPU != nil {
                return
        }
        s.Linux.Resources.CPU = &specs.LinuxCPU{}
}

// setCPUWindows makes sure the Windows resources section and its CPU entry
// exist when the spec has a Windows section. A spec with no Windows section is
// left untouched.
//
//nolint:nolintlint,unused // not used on all platforms
func setCPUWindows(s *Spec) {
        setResourcesWindows(s)
        if s.Windows == nil || s.Windows.Resources.CPU != nil {
                return
        }
        s.Windows.Resources.CPU = &specs.WindowsCPUResources{}
}

// setCapabilities makes sure s.Process and s.Process.Capabilities are
// non-nil, installing empty values for whichever is unset.
func setCapabilities(s *Spec) {
        setProcess(s)
        if s.Process.Capabilities != nil {
                return
        }
        s.Process.Capabilities = &specs.LinuxCapabilities{}
}

// ensureAdditionalGids guarantees that the process's primary GID appears in
// its additional GID list, prepending it when it is missing.
func ensureAdditionalGids(s *Spec) {
        setProcess(s)
        u := &s.Process.User
        for i := range u.AdditionalGids {
                if u.AdditionalGids[i] == u.GID {
                        // Already listed; nothing to add.
                        return
                }
        }
        u.AdditionalGids = append([]uint32{u.GID}, u.AdditionalGids...)
}

// WithDefaultSpec returns a SpecOpts that overwrites the spec with the default
// values for the default platform, which is looked up when the option runs.
//
// Use it as the first option to reset the spec, then apply further options.
func WithDefaultSpec() SpecOpts {
        return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error {
                platform := platforms.DefaultString()
                return generateDefaultSpecWithPlatform(ctx, platform, c.ID, s)
        }
}

// WithDefaultSpecForPlatform returns a SpecOpts that overwrites the spec with
// the default values for the given platform string.
//
// Use it as the first option to reset the spec, then apply further options.
func WithDefaultSpecForPlatform(platform string) SpecOpts {
        reset := func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error {
                id := c.ID
                return generateDefaultSpecWithPlatform(ctx, platform, id, s)
        }
        return reset
}

// WithSpecFromBytes returns a SpecOpts that replaces the spec with the one
// decoded from the JSON in p. The spec is cleared before decoding, so nothing
// from the previous spec survives. A decode failure is wrapped in an error
// that names the supported runtime-spec version.
func WithSpecFromBytes(p []byte) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                *s = Spec{}
                err := json.Unmarshal(p, s)
                if err == nil {
                        return nil
                }
                return fmt.Errorf("decoding spec config file failed, current supported OCI runtime-spec : v%s: %w", specs.Version, err)
        }
}

// WithSpecFromFile returns a SpecOpts that reads filename when the option runs
// and replaces the spec with its decoded JSON contents, delegating the
// decoding to WithSpecFromBytes.
func WithSpecFromFile(filename string) SpecOpts {
        return func(ctx context.Context, c Client, container *containers.Container, s *Spec) error {
                data, err := os.ReadFile(filename)
                if err != nil {
                        return fmt.Errorf("cannot load spec config file: %w", err)
                }
                load := WithSpecFromBytes(data)
                return load(ctx, c, container, s)
        }
}

// WithEnv returns a SpecOpts that merges environmentVariables into the
// process environment: an entry whose key already exists replaces the
// existing value, an entry without "=" removes that key, and new keys are
// appended. An empty list leaves the spec untouched.
func WithEnv(environmentVariables []string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                if len(environmentVariables) == 0 {
                        return nil
                }
                setProcess(s)
                merged := replaceOrAppendEnvValues(s.Process.Env, environmentVariables)
                s.Process.Env = merged
                return nil
        }
}

// replaceOrAppendEnvValues merges overrides into defaults and returns the
// result as a new slice; neither input is modified.
//
// Entries are keyed by the text before the first "=". For each override, in
// order:
//   - "KEY=value" replaces the entry for KEY in place, or is appended when
//     KEY is not present yet;
//   - "KEY" (no "=") removes the entry for KEY, if any.
//
// Keys appended from overrides are tracked as well, so a key that appears
// several times in overrides yields a single entry holding the last value,
// and can be removed by a later bare "KEY". Empty strings never appear in
// the result.
func replaceOrAppendEnvValues(defaults, overrides []string) []string {
        index := make(map[string]int, len(defaults)+len(overrides))
        results := make([]string, 0, len(defaults)+len(overrides))
        for i, e := range defaults {
                k, _, _ := strings.Cut(e, "=")
                results = append(results, e)
                index[k] = i
        }

        for _, value := range overrides {
                k, _, hasValue := strings.Cut(value, "=")
                i, exists := index[k]
                switch {
                case !hasValue:
                        // Values w/o = means they want this env to be removed/unset.
                        if exists {
                                results[i] = "" // marked for removal below
                        }
                case exists:
                        results[i] = value
                default:
                        // Remember where the new key lives so later overrides of the
                        // same key update it instead of adding a duplicate.
                        index[k] = len(results)
                        results = append(results, value)
                }
        }

        // Drop everything marked for removal in one pass, reusing the backing
        // array that this function owns.
        kept := results[:0]
        for _, e := range results {
                if e != "" {
                        kept = append(kept, e)
                }
        }
        return kept
}

// WithProcessArgs returns a SpecOpts that sets the process arguments to args
// and clears any CommandLine previously set on the process.
func WithProcessArgs(args ...string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setProcess(s)
                proc := s.Process
                proc.Args, proc.CommandLine = args, ""
                return nil
        }
}

// WithProcessCwd returns a SpecOpts that sets the working directory of the
// process to cwd.
func WithProcessCwd(cwd string) SpecOpts {
        setCwd := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setProcess(s)
                s.Process.Cwd = cwd
                return nil
        }
        return setCwd
}

// WithTTY marks the process as attached to a terminal and, when the spec has
// a Linux section, appends TERM=xterm to the process environment.
func WithTTY(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        setProcess(s)
        proc := s.Process
        proc.Terminal = true
        if s.Linux == nil {
                return nil
        }
        proc.Env = append(proc.Env, "TERM=xterm")
        return nil
}

// WithTTYSize returns a SpecOpts that sets the console size of the process to
// width by height, creating the ConsoleSize entry when it is unset. The values
// are converted to unsigned integers as-is.
func WithTTYSize(width, height int) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setProcess(s)
                size := s.Process.ConsoleSize
                if size == nil {
                        size = &specs.Box{}
                        s.Process.ConsoleSize = size
                }
                size.Width, size.Height = uint(width), uint(height)
                return nil
        }
}

// WithHostname returns a SpecOpts that sets the spec's Hostname to name.
func WithHostname(name string) SpecOpts {
        setHostname := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                s.Hostname = name
                return nil
        }
        return setHostname
}

// WithDomainname returns a SpecOpts that sets the spec's Domainname (the
// container's NIS domain name) to name.
func WithDomainname(name string) SpecOpts {
        setDomainname := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                s.Domainname = name
                return nil
        }
        return setDomainname
}

// WithMounts returns a SpecOpts that appends mounts to the end of the spec's
// existing mount list.
func WithMounts(mounts []specs.Mount) SpecOpts {
        appendMounts := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                combined := append(s.Mounts, mounts...)
                s.Mounts = combined
                return nil
        }
        return appendMounts
}

// WithoutMounts returns a SpecOpts that removes every mount whose destination
// matches one of dests.
//
// Both the mount destination and each requested destination are normalized
// with filepath.Clean before comparison, so a trailing slash or the host's
// path separator (for example "/run/" versus "/run") does not prevent a
// match. The relative order of the remaining mounts is preserved; when no
// mount remains, s.Mounts is set to nil.
func WithoutMounts(dests ...string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                // Normalize the requested destinations once so they compare
                // equal to the cleaned mount destinations below.
                drop := make(map[string]struct{}, len(dests))
                for _, dest := range dests {
                        drop[filepath.Clean(dest)] = struct{}{}
                }

                var mounts []specs.Mount
                for _, m := range s.Mounts {
                        if _, found := drop[filepath.Clean(m.Destination)]; found {
                                continue
                        }
                        mounts = append(mounts, m)
                }
                s.Mounts = mounts
                return nil
        }
}

// WithHostNamespace returns a SpecOpts that lets the task share the host's
// namespace of type ns by removing the first entry of that type from the
// spec's Linux namespace list. It does nothing when no such entry exists.
func WithHostNamespace(ns specs.LinuxNamespaceType) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setLinux(s)
                current := s.Linux.Namespaces
                for i := range current {
                        if current[i].Type != ns {
                                continue
                        }
                        s.Linux.Namespaces = append(current[:i], current[i+1:]...)
                        return nil
                }
                return nil
        }
}

// WithLinuxNamespace returns a SpecOpts that installs ns in the spec. The
// first existing namespace of the same type is overwritten in place; if there
// is none, ns is appended to the list.
func WithLinuxNamespace(ns specs.LinuxNamespace) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setLinux(s)
                linux := s.Linux
                for i := range linux.Namespaces {
                        if linux.Namespaces[i].Type != ns.Type {
                                continue
                        }
                        linux.Namespaces[i] = ns
                        return nil
                }
                linux.Namespaces = append(linux.Namespaces, ns)
                return nil
        }
}

// WithNewPrivileges clears the NoNewPrivileges flag on the process, creating
// the Process section first when it is unset.
func WithNewPrivileges(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        if s.Process == nil {
                s.Process = &specs.Process{}
        }
        s.Process.NoNewPrivileges = false
        return nil
}

// WithImageConfig configures the spec from the configuration of an Image,
// keeping the image's own CMD. It is WithImageConfigArgs with no override
// arguments.
func WithImageConfig(image Image) SpecOpts {
        var noArgs []string
        return WithImageConfigArgs(image, noArgs)
}

// WithImageConfigArgs configures the spec from the configuration of an Image,
// with optional args that replace the CMD of the image.
//
// The image config blob is read from the image's content store and decoded as
// an OCI image. An error is returned when the config descriptor's media type
// is not a known config type, when the blob cannot be read or decoded, or
// when the spec has neither a Linux nor a Windows section.
//
// The Linux section is checked first, so a spec carrying both sections is
// configured as Linux:
//
//   - Linux: the image Env (or defaultUnixEnv when it is empty) forms the
//     base environment and the spec's existing Process.Env is merged on top;
//     Args is Entrypoint followed by args (or the image Cmd when args is
//     empty); Cwd is the image WorkingDir or "/"; the image User, if any, is
//     applied with WithUser, and additional GIDs are resolved afterwards.
//   - Windows: the image Env is merged the same way (without a fallback), the
//     command is placed in Args or CommandLine depending on ArgsEscaped, and
//     Cwd and Username are copied from the image config as-is.
func WithImageConfigArgs(image Image, args []string) SpecOpts {
        return func(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
                ic, err := image.Config(ctx)
                if err != nil {
                        return err
                }
                if !images.IsConfigType(ic.MediaType) {
                        return fmt.Errorf("unknown image config media type %s", ic.MediaType)
                }

                var (
                        imageConfigBytes []byte
                        ociimage         v1.Image
                        config           v1.ImageConfig
                )
                imageConfigBytes, err = content.ReadBlob(ctx, image.ContentStore(), ic)
                if err != nil {
                        return err
                }

                if err = json.Unmarshal(imageConfigBytes, &ociimage); err != nil {
                        return err
                }
                config = ociimage.Config

                // NOTE(review): appendOSMounts is defined elsewhere; presumably it
                // adds default mounts for the image's OS — confirm.
                appendOSMounts(s, ociimage.OS)
                setProcess(s)
                if s.Linux != nil {
                        defaults := config.Env
                        if len(defaults) == 0 {
                                defaults = defaultUnixEnv
                        }
                        // The image environment is the base; values already present on
                        // the spec act as overrides.
                        s.Process.Env = replaceOrAppendEnvValues(defaults, s.Process.Env)
                        cmd := config.Cmd
                        if len(args) > 0 {
                                cmd = args
                        }
                        s.Process.Args = append(config.Entrypoint, cmd...)

                        cwd := config.WorkingDir
                        if cwd == "" {
                                cwd = "/"
                        }
                        s.Process.Cwd = cwd
                        if config.User != "" {
                                if err := WithUser(config.User)(ctx, client, c, s); err != nil {
                                        return err
                                }
                                // Look up additional GIDs using the UID that WithUser resolved.
                                return WithAdditionalGIDs(strconv.FormatInt(int64(s.Process.User.UID), 10))(ctx, client, c, s)
                        }
                        // we should query the image's /etc/group for additional GIDs
                        // even if there is no specified user in the image config
                        return WithAdditionalGIDs("root")(ctx, client, c, s)
                } else if s.Windows != nil {
                        s.Process.Env = replaceOrAppendEnvValues(config.Env, s.Process.Env)

                        // To support Docker ArgsEscaped on Windows we need to combine the
                        // image Entrypoint & (Cmd Or User Args) while taking into account
                        // if Docker has already escaped them in the image config. When
                        // Docker sets `ArgsEscaped==true` in the config it has pre-escaped
                        // either Entrypoint or Cmd or both. Cmd should always be treated as
                        // arguments appended to Entrypoint unless:
                        //
                        // 1. Entrypoint does not exist, in which case Cmd[0] is the
                        // executable.
                        //
                        // 2. The user overrides the Cmd with User Args when activating the
                        // container in which case those args should be appended to the
                        // Entrypoint if it exists.
                        //
                        // To effectively do this we need to know if the arguments came from
                        // the user or if the arguments came from the image config when
                        // ArgsEscaped==true. In this case we only want to escape the
                        // additional user args when forming the complete CommandLine. This
                        // is safe in both cases of Entrypoint or Cmd being set because
                        // Docker will always escape them to an array of length one. Thus in
                        // both cases it is the "executable" portion of the command.
                        //
                        // In the case ArgsEscaped==false, Entrypoint or Cmd will contain
                        // any number of entries that are all unescaped and can simply be
                        // combined (potentially overwriting Cmd with User Args if present)
                        // and forwarded the container start as an Args array.
                        cmd := config.Cmd
                        cmdFromImage := true
                        if len(args) > 0 {
                                cmd = args
                                cmdFromImage = false
                        }

                        cmd = append(config.Entrypoint, cmd...)
                        if len(cmd) == 0 {
                                return errors.New("no arguments specified")
                        }

                        //nolint:staticcheck // ArgsEscaped is deprecated
                        if config.ArgsEscaped && (len(config.Entrypoint) > 0 || cmdFromImage) {
                                // cmd[0] is already escaped by the image; only the remaining
                                // entries are escaped here before being joined.
                                s.Process.Args = nil
                                s.Process.CommandLine = cmd[0]
                                if len(cmd) > 1 {
                                        s.Process.CommandLine += " " + escapeAndCombineArgs(cmd[1:])
                                }
                        } else {
                                s.Process.Args = cmd
                                s.Process.CommandLine = ""
                        }

                        s.Process.Cwd = config.WorkingDir
                        s.Process.User = specs.User{
                                Username: config.User,
                        }
                } else {
                        return errors.New("spec does not contain Linux or Windows section")
                }
                return nil
        }
}

// WithRootFSPath returns a SpecOpts that points the spec's root at an
// unmanaged rootfs path, creating the Root section when it is unset.
func WithRootFSPath(path string) SpecOpts {
        setPath := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setRoot(s)
                // Entrypoint is not set here (it's up to caller)
                s.Root.Path = path
                return nil
        }
        return setPath
}

// WithRootFSReadonly returns a SpecOpts that marks the spec's root filesystem
// as read-only, creating the Root section when it is unset.
func WithRootFSReadonly() SpecOpts {
        makeReadonly := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setRoot(s)
                s.Root.Readonly = true
                return nil
        }
        return makeReadonly
}

// WithNoNewPrivileges sets the NoNewPrivileges flag on the container's
// process, creating the Process section first when it is unset.
func WithNoNewPrivileges(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        if s.Process == nil {
                s.Process = &specs.Process{}
        }
        s.Process.NoNewPrivileges = true
        return nil
}

// WithHostHostsFile appends a read-only recursive bind mount of the host's
// /etc/hosts at the same path inside the container.
func WithHostHostsFile(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        const hostsPath = "/etc/hosts"

        hosts := specs.Mount{
                Destination: hostsPath,
                Type:        "bind",
                Source:      hostsPath,
                Options:     []string{"rbind", "ro"},
        }
        s.Mounts = append(s.Mounts, hosts)
        return nil
}

// WithHostResolvconf appends a read-only recursive bind mount of the host's
// /etc/resolv.conf at the same path inside the container.
func WithHostResolvconf(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        const resolvPath = "/etc/resolv.conf"

        resolv := specs.Mount{
                Destination: resolvPath,
                Type:        "bind",
                Source:      resolvPath,
                Options:     []string{"rbind", "ro"},
        }
        s.Mounts = append(s.Mounts, resolv)
        return nil
}

// WithHostLocaltime appends a read-only recursive bind mount of the host's
// /etc/localtime at the same path inside the container.
func WithHostLocaltime(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        const localtimePath = "/etc/localtime"

        localtime := specs.Mount{
                Destination: localtimePath,
                Type:        "bind",
                Source:      localtimePath,
                Options:     []string{"rbind", "ro"},
        }
        s.Mounts = append(s.Mounts, localtime)
        return nil
}

// WithUserNamespace returns a SpecOpts that adds a user namespace to the spec
// (unless one is already listed) and appends uidMap and gidMap to the
// existing UID and GID mappings. It may be applied several times to
// accumulate more mappings.
func WithUserNamespace(uidMap, gidMap []specs.LinuxIDMapping) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setLinux(s)
                linux := s.Linux

                // Find the first user namespace entry, if any.
                idx := 0
                for idx < len(linux.Namespaces) && linux.Namespaces[idx].Type != specs.UserNamespace {
                        idx++
                }
                if idx == len(linux.Namespaces) {
                        linux.Namespaces = append(linux.Namespaces, specs.LinuxNamespace{
                                Type: specs.UserNamespace,
                        })
                }

                linux.UIDMappings = append(linux.UIDMappings, uidMap...)
                linux.GIDMappings = append(linux.GIDMappings, gidMap...)
                return nil
        }
}

// WithCgroup returns a SpecOpts that sets the container's cgroup path,
// creating the Linux section when it is unset.
func WithCgroup(path string) SpecOpts {
        setPath := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setLinux(s)
                s.Linux.CgroupsPath = path
                return nil
        }
        return setPath
}

// WithNamespacedCgroup returns a SpecOpts that sets the cgroup path to
// "/<namespace>/<container id>", using the namespace carried by the context.
// The option fails, leaving the spec untouched, when the context has no
// namespace.
func WithNamespacedCgroup() SpecOpts {
        return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error {
                namespace, err := namespaces.NamespaceRequired(ctx)
                if err != nil {
                        return err
                }

                cgroupPath := filepath.Join("/", namespace, c.ID)
                setLinux(s)
                s.Linux.CgroupsPath = cgroupPath
                return nil
        }
}

// WithUser sets the user to be used within the container.
// It accepts a valid user string in OCI Image Spec v1.0.0:
//
//        user, uid, user:group, uid:gid, uid:group, user:gid
//
// A component is only treated as a numeric ID when it parses as an integer in
// the range [0, 2^31-1]. Anything else, including negative numbers and values
// that would wrap when converted to uint32, is treated as a user or group name
// and resolved against the container's root filesystem.
//
// When the spec has both a Windows and a Linux section, or when running on
// Darwin, the string is stored unmodified in Process.User.Username and nothing
// is resolved. A string with more than two ":"-separated components is
// rejected with an error.
func WithUser(userstr string) SpecOpts {
        // maxID is the largest value accepted as a numeric uid/gid. Bounding the
        // value before the uint32 conversion prevents a negative or oversized
        // number from silently wrapping to a different ID (for example 0).
        const maxID = 1<<31 - 1

        // parseID reports whether part is a numeric ID within [0, maxID] and, if
        // so, returns it as a uint32.
        parseID := func(part string) (uint32, bool) {
                v, err := strconv.Atoi(part)
                if err != nil || v < 0 || v > maxID {
                        return 0, false
                }
                return uint32(v), true
        }

        return func(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
                defer ensureAdditionalGids(s)
                setProcess(s)
                s.Process.User.AdditionalGids = nil

                // For LCOW it's a bit harder to confirm that the user actually exists on the host as a rootfs isn't
                // mounted on the host and shared into the guest, but rather the rootfs is constructed entirely in the
                // guest itself. To accommodate this, a spot to place the user string provided by a client as-is is needed.
                // The `Username` field on the runtime spec is marked by Platform as only for Windows, and in this case it
                // *is* being set on a Windows host at least, but will be used as a temporary holding spot until the guest
                // can use the string to perform these same operations to grab the uid:gid inside.
                //
                // Mounts are not supported on Darwin, so using the same workaround.
                if (s.Windows != nil && s.Linux != nil) || runtime.GOOS == "darwin" {
                        s.Process.User.Username = userstr
                        return nil
                }

                parts := strings.Split(userstr, ":")
                switch len(parts) {
                case 1:
                        uid, ok := parseID(parts[0])
                        if !ok {
                                // Not a usable numeric ID; try to resolve it as a username.
                                return WithUsername(userstr)(ctx, client, c, s)
                        }
                        return WithUserID(uid)(ctx, client, c, s)
                case 2:
                        var username, groupname string
                        uid, ok := parseID(parts[0])
                        if !ok {
                                username = parts[0]
                        }
                        gid, ok := parseID(parts[1])
                        if !ok {
                                groupname = parts[1]
                        }
                        if username == "" && groupname == "" {
                                // Nothing needs to be looked up in the rootfs.
                                s.Process.User.UID, s.Process.User.GID = uid, gid
                                return nil
                        }

                        // lookup resolves whichever of the user/group was given by name
                        // using the passwd/group files under root, then stores the result.
                        lookup := func(root string) error {
                                if username != "" {
                                        usr, err := UserFromPath(root, func(u user.User) bool {
                                                return u.Name == username
                                        })
                                        if err != nil {
                                                return err
                                        }
                                        uid = uint32(usr.Uid)
                                }
                                if groupname != "" {
                                        g, err := GIDFromPath(root, func(g user.Group) bool {
                                                return g.Name == groupname
                                        })
                                        if err != nil {
                                                return err
                                        }
                                        gid = g
                                }
                                s.Process.User.UID, s.Process.User.GID = uid, gid
                                return nil
                        }

                        // Without snapshot information the rootfs path from the spec is
                        // inspected directly, so it has to be absolute.
                        if c.Snapshotter == "" && c.SnapshotKey == "" {
                                if !isRootfsAbs(s.Root.Path) {
                                        return errors.New("rootfs absolute path is required")
                                }
                                return lookup(s.Root.Path)
                        }
                        if c.Snapshotter == "" {
                                return errors.New("no snapshotter set for container")
                        }
                        if c.SnapshotKey == "" {
                                return errors.New("rootfs snapshot not created for container")
                        }
                        snapshotter := client.SnapshotService(c.Snapshotter)
                        mounts, err := snapshotter.Mounts(ctx, c.SnapshotKey)
                        if err != nil {
                                return err
                        }

                        // Use a read-only mount when trying to get user/group information
                        // from the container's rootfs. Since the option does read operation
                        // only, we append ReadOnly mount option to prevent the Linux kernel
                        // from syncing whole filesystem in umount syscall.
                        return mount.WithReadonlyTempMount(ctx, mounts, lookup)
                default:
                        return fmt.Errorf("invalid USER value %s", userstr)
                }
        }
}

// WithUIDGID returns a SpecOpts that assigns the given uid and gid to the
// process user and clears any previously set additional GIDs.
// ensureAdditionalGids is deferred and therefore runs after the assignment.
func WithUIDGID(uid, gid uint32) SpecOpts {
        applyIDs := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                defer ensureAdditionalGids(s)
                setProcess(s)
                s.Process.User.AdditionalGids = nil
                s.Process.User.UID, s.Process.User.GID = uid, gid
                return nil
        }
        return applyIDs
}

// WithUserID resolves uid against the /etc/passwd of the container's root
// filesystem and sets the process UID and GID from the matching entry.
// When /etc/passwd does not exist, or has no entry for uid, the requested uid
// is used together with gid 0 and no error is returned.
//
// The rootfs is either the absolute Root.Path of the spec (when the container
// has neither snapshotter nor snapshot key) or a temporary read-only mount of
// the container's snapshot.
func WithUserID(uid uint32) SpecOpts {
        return func(ctx context.Context, client Client, c *containers.Container, s *Spec) (err error) {
                defer ensureAdditionalGids(s)
                setProcess(s)
                s.Process.User.AdditionalGids = nil

                // resolve looks uid up under root and records the resulting IDs.
                resolve := func(root string) error {
                        entry, lookupErr := UserFromPath(root, func(u user.User) bool {
                                return u.Uid == int(uid)
                        })
                        switch {
                        case lookupErr == nil:
                                s.Process.User.UID, s.Process.User.GID = uint32(entry.Uid), uint32(entry.Gid)
                                return nil
                        case os.IsNotExist(lookupErr) || lookupErr == ErrNoUsersFound:
                                // Unknown user: keep the requested uid and fall back to gid 0.
                                s.Process.User.UID, s.Process.User.GID = uid, 0
                                return nil
                        default:
                                return lookupErr
                        }
                }

                switch {
                case c.Snapshotter == "" && c.SnapshotKey == "":
                        if !isRootfsAbs(s.Root.Path) {
                                return errors.New("rootfs absolute path is required")
                        }
                        return resolve(s.Root.Path)
                case c.Snapshotter == "":
                        return errors.New("no snapshotter set for container")
                case c.SnapshotKey == "":
                        return errors.New("rootfs snapshot not created for container")
                }

                mounts, err := client.SnapshotService(c.Snapshotter).Mounts(ctx, c.SnapshotKey)
                if err != nil {
                        return err
                }

                // The lookup only reads, so mount read-only; this prevents the Linux
                // kernel from syncing the whole filesystem in the umount syscall.
                return mount.WithReadonlyTempMount(ctx, mounts, resolve)
        }
}

// WithUsername resolves username against the /etc/passwd of the container's
// root filesystem and sets the process UID and GID from the matching entry.
// Any lookup failure, including a missing /etc/passwd or an unknown username,
// is returned as an error.
//
// When the spec has no Linux section but has a Windows section, the username
// is stored as provided in Process.User.Username; the operating system will
// validate the user when going to run the container. A spec with neither
// section is rejected.
func WithUsername(username string) SpecOpts {
        return func(ctx context.Context, client Client, c *containers.Container, s *Spec) (err error) {
                defer ensureAdditionalGids(s)
                setProcess(s)
                s.Process.User.AdditionalGids = nil

                if s.Linux == nil {
                        if s.Windows == nil {
                                return errors.New("spec does not contain Linux or Windows section")
                        }
                        s.Process.User.Username = username
                        return nil
                }

                // resolve looks username up under root and records the resulting IDs.
                resolve := func(root string) error {
                        entry, lookupErr := UserFromPath(root, func(u user.User) bool {
                                return u.Name == username
                        })
                        if lookupErr != nil {
                                return lookupErr
                        }
                        s.Process.User.UID, s.Process.User.GID = uint32(entry.Uid), uint32(entry.Gid)
                        return nil
                }

                switch {
                case c.Snapshotter == "" && c.SnapshotKey == "":
                        if !isRootfsAbs(s.Root.Path) {
                                return errors.New("rootfs absolute path is required")
                        }
                        return resolve(s.Root.Path)
                case c.Snapshotter == "":
                        return errors.New("no snapshotter set for container")
                case c.SnapshotKey == "":
                        return errors.New("rootfs snapshot not created for container")
                }

                mounts, err := client.SnapshotService(c.Snapshotter).Mounts(ctx, c.SnapshotKey)
                if err != nil {
                        return err
                }

                // The lookup only reads, so mount read-only; this prevents the Linux
                // kernel from syncing the whole filesystem in the umount syscall.
                return mount.WithReadonlyTempMount(ctx, mounts, resolve)
        }
}

// WithAdditionalGIDs sets the OCI spec's additionalGids array to the
// supplemental groups listed for a user in the /etc/group file of the image's
// root filesystem. userstr can be either a uid or a username.
//
// The option is a no-op when the spec has a Windows section or when running on
// Darwin. A missing passwd or group file, or a uid without a passwd entry, is
// not an error; the additional GIDs are then left cleared.
func WithAdditionalGIDs(userstr string) SpecOpts {
        return func(ctx context.Context, client Client, c *containers.Container, s *Spec) (err error) {
                // For LCOW or on Darwin additional GID's not supported
                if s.Windows != nil || runtime.GOOS == "darwin" {
                        return nil
                }
                setProcess(s)
                s.Process.User.AdditionalGids = nil

                collect := func(root string) error {
                        defer ensureAdditionalGids(s)

                        // A numeric userstr is translated to the user's name first,
                        // because /etc/group lists members by name.
                        name := userstr
                        if uid, convErr := strconv.Atoi(userstr); convErr == nil {
                                entry, lookupErr := UserFromPath(root, func(u user.User) bool {
                                        return u.Uid == uid
                                })
                                if lookupErr != nil {
                                        if os.IsNotExist(lookupErr) || lookupErr == ErrNoUsersFound {
                                                return nil
                                        }
                                        return lookupErr
                                }
                                name = entry.Name
                        }

                        // isSupplemental selects groups that list the user as a member,
                        // skipping the group that carries the user's own name.
                        isSupplemental := func(g user.Group) bool {
                                if g.Name == name {
                                        return false
                                }
                                for _, member := range g.List {
                                        if member == name {
                                                return true
                                        }
                                }
                                return false
                        }

                        gids, groupErr := getSupplementalGroupsFromPath(root, isSupplemental)
                        if groupErr != nil {
                                if os.IsNotExist(groupErr) {
                                        return nil
                                }
                                return groupErr
                        }
                        s.Process.User.AdditionalGids = gids
                        return nil
                }

                switch {
                case c.Snapshotter == "" && c.SnapshotKey == "":
                        if !isRootfsAbs(s.Root.Path) {
                                return errors.New("rootfs absolute path is required")
                        }
                        return collect(s.Root.Path)
                case c.Snapshotter == "":
                        return errors.New("no snapshotter set for container")
                case c.SnapshotKey == "":
                        return errors.New("rootfs snapshot not created for container")
                }

                mounts, err := client.SnapshotService(c.Snapshotter).Mounts(ctx, c.SnapshotKey)
                if err != nil {
                        return err
                }

                // The lookup only reads, so mount read-only; this prevents the Linux
                // kernel from syncing the whole filesystem in the umount syscall.
                return mount.WithReadonlyTempMount(ctx, mounts, collect)
        }
}

// WithAppendAdditionalGroups appends additional groups to the process user
// within the container. Each entry of groups can be either a gid or a group
// name; names are resolved through /etc/group in the container's root
// filesystem.
//
// The option is a no-op when the spec has a Windows section or when running on
// Darwin. A missing /etc/group is tolerated as long as every entry is numeric;
// an unresolvable group name is an error.
func WithAppendAdditionalGroups(groups ...string) SpecOpts {
        return func(ctx context.Context, client Client, c *containers.Container, s *Spec) (err error) {
                // For LCOW or on Darwin additional GID's are not supported
                if s.Windows != nil || runtime.GOOS == "darwin" {
                        return nil
                }
                setProcess(s)

                appendGroups := func(root string) error {
                        defer ensureAdditionalGids(s)

                        groupFile, pathErr := fs.RootPath(root, "/etc/group")
                        if pathErr != nil {
                                return pathErr
                        }
                        // A missing group file is remembered rather than returned, so
                        // purely numeric requests still succeed.
                        parsed, groupErr := user.ParseGroupFile(groupFile)
                        if groupErr != nil && !os.IsNotExist(groupErr) {
                                return groupErr
                        }
                        byName := make(map[string]user.Group)
                        for _, entry := range parsed {
                                byName[entry.Name] = entry
                        }

                        var resolved []uint32
                        for _, requested := range groups {
                                if numeric, convErr := strconv.ParseUint(requested, 10, 32); convErr == nil {
                                        resolved = append(resolved, uint32(numeric))
                                        continue
                                }
                                entry, known := byName[requested]
                                if !known {
                                        if groupErr != nil {
                                                return fmt.Errorf("unable to find group %s: %w", requested, groupErr)
                                        }
                                        return fmt.Errorf("unable to find group %s", requested)
                                }
                                resolved = append(resolved, uint32(entry.Gid))
                        }
                        s.Process.User.AdditionalGids = append(s.Process.User.AdditionalGids, resolved...)
                        return nil
                }

                switch {
                case c.Snapshotter == "" && c.SnapshotKey == "":
                        if !filepath.IsAbs(s.Root.Path) {
                                return errors.New("rootfs absolute path is required")
                        }
                        return appendGroups(s.Root.Path)
                case c.Snapshotter == "":
                        return errors.New("no snapshotter set for container")
                case c.SnapshotKey == "":
                        return errors.New("rootfs snapshot not created for container")
                }

                mounts, err := client.SnapshotService(c.Snapshotter).Mounts(ctx, c.SnapshotKey)
                if err != nil {
                        return err
                }

                // The lookup only reads, so mount read-only; this prevents the Linux
                // kernel from syncing the whole filesystem in the umount syscall.
                return mount.WithReadonlyTempMount(ctx, mounts, appendGroups)
        }
}

// WithCapabilities returns a SpecOpts that replaces the bounding, effective
// and permitted capability sets of the process with caps. The inheritable set
// is cleared when caps is empty; otherwise a non-empty inheritable set is
// narrowed to the entries that also appear in caps.
func WithCapabilities(caps []string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCapabilities(s)

                s.Process.Capabilities.Bounding = caps
                s.Process.Capabilities.Effective = caps
                s.Process.Capabilities.Permitted = caps

                switch {
                case len(caps) == 0:
                        s.Process.Capabilities.Inheritable = nil
                case len(s.Process.Capabilities.Inheritable) > 0:
                        filterCaps(&s.Process.Capabilities.Inheritable, caps)
                }
                return nil
        }
}

// capsContain reports whether s is an element of caps. The comparison is an
// exact string match.
func capsContain(caps []string, s string) bool {
        for i := 0; i < len(caps); i++ {
                if caps[i] == s {
                        return true
                }
        }
        return false
}

// removeCap rewrites *caps to a newly built slice holding every element that
// differs from s, in the original order. When no element remains, *caps
// becomes nil.
func removeCap(caps *[]string, s string) {
        var kept []string
        for i := range *caps {
                if entry := (*caps)[i]; entry != s {
                        kept = append(kept, entry)
                }
        }
        *caps = kept
}

// filterCaps rewrites *caps to a newly built slice holding only the elements
// that also appear in filters, in the original order. When no element
// remains, *caps becomes nil.
func filterCaps(caps *[]string, filters []string) {
        var allowed []string
        for i := range *caps {
                entry := (*caps)[i]
                if !capsContain(filters, entry) {
                        continue
                }
                allowed = append(allowed, entry)
        }
        *caps = allowed
}

// WithAddedCapabilities returns a SpecOpts that adds each of caps to the
// bounding, effective and permitted sets of the process, skipping entries a
// set already contains. The inheritable and ambient sets are not modified.
func WithAddedCapabilities(caps []string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCapabilities(s)
                for i := range caps {
                        capName := caps[i]
                        for _, set := range []*[]string{
                                &s.Process.Capabilities.Bounding,
                                &s.Process.Capabilities.Effective,
                                &s.Process.Capabilities.Permitted,
                        } {
                                if capsContain(*set, capName) {
                                        continue
                                }
                                *set = append(*set, capName)
                        }
                }
                return nil
        }
}

// WithDroppedCapabilities returns a SpecOpts that removes each of caps from
// the bounding, effective, permitted and inheritable sets of the process.
// The ambient set is not modified.
func WithDroppedCapabilities(caps []string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCapabilities(s)
                for i := range caps {
                        dropped := caps[i]
                        for _, set := range []*[]string{
                                &s.Process.Capabilities.Bounding,
                                &s.Process.Capabilities.Effective,
                                &s.Process.Capabilities.Permitted,
                                &s.Process.Capabilities.Inheritable,
                        } {
                                removeCap(set, dropped)
                        }
                }
                return nil
        }
}

// WithAmbientCapabilities returns a SpecOpts that sets both the inheritable
// and the ambient capability sets of the process to caps.
// Ambient capabilities should only be set for non-root users, or by callers
// that understand how these capabilities are used and set.
func WithAmbientCapabilities(caps []string) SpecOpts {
        applyAmbient := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCapabilities(s)
                s.Process.Capabilities.Inheritable, s.Process.Capabilities.Ambient = caps, caps
                return nil
        }
        return applyAmbient
}

// ErrNoUsersFound is returned by UserFromPath when no passwd entry matches.
var ErrNoUsersFound = errors.New("no users found")

// UserFromPath looks up a user in the /etc/passwd file located under root.
// filter selects the entries of interest and can be nil. The first entry
// returned by the parser is the result; ErrNoUsersFound is returned when there
// is none. Errors from resolving the path or parsing the file are returned
// unchanged.
func UserFromPath(root string, filter func(user.User) bool) (user.User, error) {
        var none user.User

        passwdFile, err := fs.RootPath(root, "/etc/passwd")
        if err != nil {
                return none, err
        }
        matches, err := user.ParsePasswdFileFilter(passwdFile, filter)
        switch {
        case err != nil:
                return none, err
        case len(matches) == 0:
                return none, ErrNoUsersFound
        }
        return matches[0], nil
}

// ErrNoGroupsFound is returned by GIDFromPath when no group entry matches.
var ErrNoGroupsFound = errors.New("no groups found")

// GIDFromPath looks up a group in the /etc/group file located under root and
// returns its GID. filter selects the entries of interest and can be nil. The
// first entry returned by the parser is the result; ErrNoGroupsFound is
// returned when there is none. Errors from resolving the path or parsing the
// file are returned unchanged.
func GIDFromPath(root string, filter func(user.Group) bool) (gid uint32, err error) {
        groupFile, err := fs.RootPath(root, "/etc/group")
        if err != nil {
                return 0, err
        }
        matches, err := user.ParseGroupFileFilter(groupFile, filter)
        if err != nil {
                return 0, err
        }
        if len(matches) > 0 {
                return uint32(matches[0].Gid), nil
        }
        return 0, ErrNoGroupsFound
}

// getSupplementalGroupsFromPath returns the GIDs of all entries in the
// /etc/group file under root that are accepted by filter. The returned slice
// is never nil: it is empty when nothing matches and also when an error is
// returned.
func getSupplementalGroupsFromPath(root string, filter func(user.Group) bool) ([]uint32, error) {
        gids := []uint32{}

        groupFile, err := fs.RootPath(root, "/etc/group")
        if err != nil {
                return gids, err
        }
        matches, err := user.ParseGroupFileFilter(groupFile, filter)
        if err != nil {
                return gids, err
        }
        for i := range matches {
                gids = append(gids, uint32(matches[i].Gid))
        }
        return gids, nil
}

// isRootfsAbs reports whether root is an absolute path according to
// filepath.IsAbs.
func isRootfsAbs(root string) bool {
        absolute := filepath.IsAbs(root)
        return absolute
}

// WithMaskedPaths returns a SpecOpts that replaces the spec's Linux masked
// paths with paths. Passing nil clears them.
func WithMaskedPaths(paths []string) SpecOpts {
        applyMasked := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setLinux(s)
                s.Linux.MaskedPaths = paths
                return nil
        }
        return applyMasked
}

// WithReadonlyPaths returns a SpecOpts that replaces the spec's Linux
// read-only paths with paths. Passing nil clears them.
func WithReadonlyPaths(paths []string) SpecOpts {
        applyReadonly := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setLinux(s)
                s.Linux.ReadonlyPaths = paths
                return nil
        }
        return applyReadonly
}

// WithWriteableSysfs makes every mount of type "sysfs" writeable by replacing
// each "ro" mount option with "rw" in place. Other mounts are left untouched.
func WithWriteableSysfs(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        for mi := range s.Mounts {
                if s.Mounts[mi].Type != "sysfs" {
                        continue
                }
                opts := s.Mounts[mi].Options
                for oi := range opts {
                        if opts[oi] == "ro" {
                                opts[oi] = "rw"
                        }
                }
        }
        return nil
}

// WithWriteableCgroupfs makes every mount of type "cgroup" writeable by
// replacing each "ro" mount option with "rw" in place. Other mounts are left
// untouched.
func WithWriteableCgroupfs(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        for mi := range s.Mounts {
                if s.Mounts[mi].Type != "cgroup" {
                        continue
                }
                opts := s.Mounts[mi].Options
                for oi := range opts {
                        if opts[oi] == "ro" {
                                opts[oi] = "rw"
                        }
                }
        }
        return nil
}

// WithSelinuxLabel returns a SpecOpts that stores label as the SELinux label
// of the process. An empty label clears it.
func WithSelinuxLabel(label string) SpecOpts {
        applyLabel := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setProcess(s)
                s.Process.SelinuxLabel = label
                return nil
        }
        return applyLabel
}

// WithApparmorProfile returns a SpecOpts that stores profile as the AppArmor
// profile of the process. An empty profile clears it.
func WithApparmorProfile(profile string) SpecOpts {
        applyProfile := func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setProcess(s)
                s.Process.ApparmorProfile = profile
                return nil
        }
        return applyProfile
}

// WithSeccompUnconfined clears the seccomp profile by setting the spec's
// Linux.Seccomp field to nil. It always returns nil.
func WithSeccompUnconfined(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        // setLinux runs first so that s.Linux can be written to below.
        // NOTE(review): presumably it allocates s.Linux when nil — confirm.
        setLinux(s)
        s.Linux.Seccomp = nil
        return nil
}

// WithParentCgroupDevices uses the default cgroup setup to inherit the
// container's parent cgroup's allowed and denied devices. It does so by
// clearing the device rules of the spec's Linux resources, allocating the
// resources section first when it is missing.
func WithParentCgroupDevices(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        setLinux(s)
        res := s.Linux.Resources
        if res == nil {
                res = &specs.LinuxResources{}
                s.Linux.Resources = res
        }
        res.Devices = nil
        return nil
}

// WithAllDevicesAllowed permits READ WRITE MKNOD on all device nodes for the
// container. Any existing device rules are replaced by a single allow rule
// that names no type, major or minor number.
func WithAllDevicesAllowed(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        setLinux(s)
        if s.Linux.Resources == nil {
                s.Linux.Resources = &specs.LinuxResources{}
        }
        allowAll := specs.LinuxDeviceCgroup{
                Allow:  true,
                Access: rwm,
        }
        s.Linux.Resources.Devices = []specs.LinuxDeviceCgroup{allowAll}
        return nil
}

// WithDefaultUnixDevices appends allow rules for the default unix character
// devices, such as /dev/null and /dev/random, to the device rules of the
// container's resource cgroup spec. Existing rules are kept.
func WithDefaultUnixDevices(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        setLinux(s)
        if s.Linux.Resources == nil {
                s.Linux.Resources = &specs.LinuxResources{}
        }

        // charDev builds an allow rule for one character device. Every call
        // yields its own major/minor pointers.
        charDev := func(major, minor int64) specs.LinuxDeviceCgroup {
                return specs.LinuxDeviceCgroup{
                        Type:   "c",
                        Major:  &major,
                        Minor:  &minor,
                        Access: rwm,
                        Allow:  true,
                }
        }
        ptsMajor := int64(136)

        defaults := []specs.LinuxDeviceCgroup{
                charDev(1, 3), // "/dev/null"
                charDev(1, 8), // "/dev/random"
                charDev(1, 7), // "/dev/full"
                charDev(5, 0), // "/dev/tty"
                charDev(1, 5), // "/dev/zero"
                charDev(1, 9), // "/dev/urandom"
                charDev(5, 1), // "/dev/console"
                // /dev/pts/ - pts namespaces are "coming soon"
                // No minor number is set for this rule.
                {
                        Type:   "c",
                        Major:  &ptsMajor,
                        Access: rwm,
                        Allow:  true,
                },
                charDev(5, 2), // "dev/ptmx"
        }
        s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, defaults...)
        return nil
}

// WithPrivileged sets up options for a privileged container by combining the
// options listed below through Compose.
// NOTE(review): presumably Compose applies the options in the listed order —
// confirm against its definition.
var WithPrivileged = Compose(
        WithAllCurrentCapabilities,
        // Passing nil clears the masked and read-only path lists.
        WithMaskedPaths(nil),
        WithReadonlyPaths(nil),
        // Flip "ro" to "rw" on sysfs and cgroup mounts.
        WithWriteableSysfs,
        WithWriteableCgroupfs,
        // Empty strings clear the SELinux label and the AppArmor profile.
        WithSelinuxLabel(""),
        WithApparmorProfile(""),
        // Drop the seccomp profile.
        WithSeccompUnconfined,
)

// WithWindowsHyperV makes sure the spec has a Windows.HyperV section, which
// selects HyperV isolation of containers. Missing Windows and HyperV sections
// are allocated; existing ones are left as they are.
func WithWindowsHyperV(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        win := s.Windows
        if win == nil {
                win = &specs.Windows{}
                s.Windows = win
        }
        if win.HyperV == nil {
                win.HyperV = &specs.WindowsHyperV{}
        }
        return nil
}

// WithMemoryLimit sets the `Linux.LinuxResources.Memory.Limit` section to the
// `limit` specified if the `Linux` section is not `nil`, and sets the
// `Windows.WindowsResources.Memory.Limit` section if the `Windows` section is
// not `nil`. Missing Resources and Memory sections are allocated as needed.
// A spec with neither section is left unchanged.
func WithMemoryLimit(limit uint64) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                if lin := s.Linux; lin != nil {
                        if lin.Resources == nil {
                                lin.Resources = &specs.LinuxResources{}
                        }
                        if lin.Resources.Memory == nil {
                                lin.Resources.Memory = &specs.LinuxMemory{}
                        }
                        // The Linux limit is a signed value, so convert into a fresh
                        // variable and point at that.
                        linuxLimit := int64(limit)
                        lin.Resources.Memory.Limit = &linuxLimit
                }
                if win := s.Windows; win != nil {
                        if win.Resources == nil {
                                win.Resources = &specs.WindowsResources{}
                        }
                        if win.Resources.Memory == nil {
                                win.Resources.Memory = &specs.WindowsMemoryResources{}
                        }
                        win.Resources.Memory.Limit = &limit
                }
                return nil
        }
}

// WithAnnotations merges the provided annotations into the spec. Keys that
// already exist on the spec are overwritten; all other existing annotations
// are kept. The spec's annotation map is allocated when it is nil.
func WithAnnotations(annotations map[string]string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                dst := s.Annotations
                if dst == nil {
                        dst = make(map[string]string)
                        s.Annotations = dst
                }
                for key := range annotations {
                        dst[key] = annotations[key]
                }
                return nil
        }
}

// WithLinuxDevices appends the provided linux devices to the spec's
// Linux.Devices list, after setLinux has prepared the Linux section.
func WithLinuxDevices(devices []specs.LinuxDevice) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setLinux(s)
                lnx := s.Linux
                lnx.Devices = append(lnx.Devices, devices...)
                return nil
        }
}

// WithLinuxDeviceFollowSymlinks adds the device specified by path to the spec,
// resolving symbolic links in path before the device is looked up.
func WithLinuxDeviceFollowSymlinks(path, permissions string) SpecOpts {
        const followSymlinks = true
        return withLinuxDevice(path, permissions, followSymlinks)
}

// WithLinuxDevice adds the device specified by path to the spec. The path is
// used as given; symbolic links are not resolved first.
func WithLinuxDevice(path, permissions string) SpecOpts {
        const followSymlinks = false
        return withLinuxDevice(path, permissions, followSymlinks)
}

// withLinuxDevice is the shared implementation behind WithLinuxDevice and
// WithLinuxDeviceFollowSymlinks.
//
// The returned option looks the device up with DeviceFromPath (after
// resolving symlinks when followSymlinks is set), appends it to
// Linux.Devices, and appends an allow rule for its type/major/minor with the
// given access permissions to Linux.Resources.Devices.
func withLinuxDevice(path, permissions string, followSymlinks bool) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setLinux(s)
                setResources(s)

                if followSymlinks {
                        target, evalErr := filepath.EvalSymlinks(path)
                        if evalErr != nil {
                                return evalErr
                        }
                        // The captured path is overwritten, exactly as before.
                        path = target
                }

                device, err := DeviceFromPath(path)
                if err != nil {
                        return err
                }

                // The cgroup rule points at the device's own Major/Minor fields.
                rule := specs.LinuxDeviceCgroup{
                        Type:   device.Type,
                        Allow:  true,
                        Major:  &device.Major,
                        Minor:  &device.Minor,
                        Access: permissions,
                }

                lnx := s.Linux
                lnx.Devices = append(lnx.Devices, *device)
                lnx.Resources.Devices = append(lnx.Resources.Devices, rule)
                return nil
        }
}

// WithEnvFile adds environment variables from a file to the container's spec.
//
// The file at path is read line by line and every line is handed to WithEnv
// exactly as read; no trimming or filtering is done here. An error from
// opening or scanning the file is returned unchanged.
func WithEnvFile(path string) SpecOpts {
        return func(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
                var vars []string
                f, err := os.Open(path)
                if err != nil {
                        return err
                }
                defer f.Close()

                sc := bufio.NewScanner(f)
                for sc.Scan() {
                        vars = append(vars, sc.Text())
                }
                if err = sc.Err(); err != nil {
                        return err
                }
                // Forward the caller's arguments rather than nil: a nil
                // context.Context must never be passed to another function.
                return WithEnv(vars)(ctx, client, c, s)
        }
}

// ErrNoShmMount is returned when there is no /dev/shm mount specified in the
// config and a SpecOpts (such as WithDevShmSize) tried to set a configuration
// value on that mount.
var ErrNoShmMount = errors.New("no /dev/shm mount specified")

// WithDevShmSize sets the size of the /dev/shm mount for the container.
//
// The size value is specified in kb, kilobytes.
//
// Only the first mount whose cleaned destination is "/dev/shm", whose source
// is "shm" and whose type is "tmpfs" is updated: its existing "size=" options
// are dropped and a single "size=<kb>k" option is appended. ErrNoShmMount is
// returned when no such mount exists.
func WithDevShmSize(kb int64) SpecOpts {
        return func(ctx context.Context, _ Client, _ *containers.Container, s *Spec) error {
                for idx := range s.Mounts {
                        mnt := s.Mounts[idx]
                        isShm := filepath.Clean(mnt.Destination) == "/dev/shm" && mnt.Source == "shm" && mnt.Type == "tmpfs"
                        if !isShm {
                                continue
                        }

                        // Strip every previous size option in place.
                        opts := mnt.Options
                        for j := 0; j < len(opts); {
                                if strings.HasPrefix(opts[j], "size=") {
                                        opts = append(opts[:j], opts[j+1:]...)
                                        continue
                                }
                                j++
                        }

                        s.Mounts[idx].Options = append(opts, fmt.Sprintf("size=%dk", kb))
                        return nil
                }
                return ErrNoShmMount
        }
}

// WithWindowsDevice adds a device exposed to a Windows (WCOW or LCOW) Container.
// An empty idType is rejected with an error; the Windows section is created
// when it is missing.
func WithWindowsDevice(idType, id string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                if idType == "" {
                        return errors.New("missing idType")
                }
                win := s.Windows
                if win == nil {
                        win = &specs.Windows{}
                        s.Windows = win
                }
                dev := specs.WindowsDevice{IDType: idType, ID: id}
                win.Devices = append(win.Devices, dev)
                return nil
        }
}

// WithMemorySwap sets the container's swap in bytes, stored in
// Linux.Resources.Memory.Swap. The Memory struct is created when missing.
func WithMemorySwap(swap int64) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setResources(s)
                res := s.Linux.Resources
                if res.Memory == nil {
                        res.Memory = &specs.LinuxMemory{}
                }
                res.Memory.Swap = &swap
                return nil
        }
}

// WithPidsLimit sets the container's pid limit or maximum, stored in
// Linux.Resources.Pids.Limit. The Pids struct is created when missing.
func WithPidsLimit(limit int64) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setResources(s)
                res := s.Linux.Resources
                if res.Pids == nil {
                        res.Pids = &specs.LinuxPids{}
                }
                res.Pids.Limit = limit
                return nil
        }
}

// WithBlockIO sets the container's blkio parameters by storing blockio in
// Linux.Resources.BlockIO, replacing any previous value.
func WithBlockIO(blockio *specs.LinuxBlockIO) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setResources(s)
                res := s.Linux.Resources
                res.BlockIO = blockio
                return nil
        }
}

// WithCPUShares sets the container's cpu shares (Linux.Resources.CPU.Shares).
func WithCPUShares(shares uint64) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCPU(s)
                cpu := s.Linux.Resources.CPU
                cpu.Shares = &shares
                return nil
        }
}

// WithCPUs sets the container's cpus/cores for use by the container
// (Linux.Resources.CPU.Cpus).
func WithCPUs(cpus string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCPU(s)
                cpu := s.Linux.Resources.CPU
                cpu.Cpus = cpus
                return nil
        }
}

// WithCPUsMems sets the container's cpu mems for use by the container
// (Linux.Resources.CPU.Mems).
func WithCPUsMems(mems string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCPU(s)
                cpu := s.Linux.Resources.CPU
                cpu.Mems = mems
                return nil
        }
}

// WithCPUCFS sets the container's Completely fair scheduling (CFS) quota and
// period (Linux.Resources.CPU.Quota and Linux.Resources.CPU.Period).
func WithCPUCFS(quota int64, period uint64) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCPU(s)
                cpu := s.Linux.Resources.CPU
                cpu.Quota = &quota
                cpu.Period = &period
                return nil
        }
}

// WithCPUBurst sets the container's cpu burst (Linux.Resources.CPU.Burst).
func WithCPUBurst(burst uint64) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCPU(s)
                cpu := s.Linux.Resources.CPU
                cpu.Burst = &burst
                return nil
        }
}

// WithCPURT sets the container's realtime scheduling (RT) runtime and period
// (Linux.Resources.CPU.RealtimeRuntime and Linux.Resources.CPU.RealtimePeriod).
func WithCPURT(runtime int64, period uint64) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCPU(s)
                cpu := s.Linux.Resources.CPU
                cpu.RealtimeRuntime = &runtime
                cpu.RealtimePeriod = &period
                return nil
        }
}

// WithoutRunMount removes the `/run` inside the spec by applying
// WithoutMounts("/run").
func WithoutRunMount(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
        dropRun := WithoutMounts("/run")
        return dropRun(ctx, client, c, s)
}

// WithRdt sets the container's RDT parameters.
//
// Linux.IntelRdt is replaced with a new value built from closID,
// l3CacheSchema and memBwSchema. setLinux is called first, as the other
// Linux options in this file do before touching s.Linux, so that a spec
// without a Linux section does not cause a nil-pointer dereference.
func WithRdt(closID, l3CacheSchema, memBwSchema string) SpecOpts {
        return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error {
                setLinux(s)
                s.Linux.IntelRdt = &specs.LinuxIntelRdt{
                        ClosID:        closID,
                        L3CacheSchema: l3CacheSchema,
                        MemBwSchema:   memBwSchema,
                }
                return nil
        }
}

// WithWindowsCPUCount stores `count` in the `Windows.Resources.CPU.Count`
// section, after setCPUWindows has prepared the spec.
func WithWindowsCPUCount(count uint64) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCPUWindows(s)
                cpu := s.Windows.Resources.CPU
                cpu.Count = &count
                return nil
        }
}

// WithWindowsCPUShares stores `shares` in the `Windows.Resources.CPU.Shares`
// section, after setCPUWindows has prepared the spec.
func WithWindowsCPUShares(shares uint16) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCPUWindows(s)
                cpu := s.Windows.Resources.CPU
                cpu.Shares = &shares
                return nil
        }
}

// WithWindowsCPUMaximum stores `max` in the `Windows.Resources.CPU.Maximum`
// section, after setCPUWindows has prepared the spec.
func WithWindowsCPUMaximum(max uint16) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setCPUWindows(s)
                cpu := s.Windows.Resources.CPU
                cpu.Maximum = &max
                return nil
        }
}

// WithWindowsIgnoreFlushesDuringBoot sets `Windows.IgnoreFlushesDuringBoot`
// to true, creating the `Windows` section when it is missing.
func WithWindowsIgnoreFlushesDuringBoot() SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                win := s.Windows
                if win == nil {
                        win = &specs.Windows{}
                        s.Windows = win
                }
                win.IgnoreFlushesDuringBoot = true
                return nil
        }
}

// WithWindowNetworksAllowUnqualifiedDNSQuery sets
// `Windows.Network.AllowUnqualifiedDNSQuery` to true, creating the `Windows`
// and `Windows.Network` sections when they are missing.
func WithWindowNetworksAllowUnqualifiedDNSQuery() SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                win := s.Windows
                if win == nil {
                        win = &specs.Windows{}
                        s.Windows = win
                }
                if win.Network == nil {
                        win.Network = &specs.WindowsNetwork{}
                }
                win.Network.AllowUnqualifiedDNSQuery = true
                return nil
        }
}

// WithWindowsNetworkNamespace sets the network namespace for a Windows
// container (`Windows.Network.NetworkNamespace`), creating the `Windows` and
// `Windows.Network` sections when they are missing.
func WithWindowsNetworkNamespace(ns string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                win := s.Windows
                if win == nil {
                        win = &specs.Windows{}
                        s.Windows = win
                }
                if win.Network == nil {
                        win.Network = &specs.WindowsNetwork{}
                }
                win.Network.NetworkNamespace = ns
                return nil
        }
}

// WithCDIDevices injects the requested CDI devices into the OCI specification.
// With no devices requested the option is a no-op.
func WithCDIDevices(devices ...string) SpecOpts {
        return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error {
                if len(devices) == 0 {
                        return nil
                }

                // A failed registry refresh is only logged, not treated as fatal:
                // a dynamically generated invalid CDI Spec file of one vendor
                // shouldn't prevent injection of devices of different vendors.
                // CDI itself knows better and fails the injection if necessary.
                if refreshErr := cdi.Refresh(); refreshErr != nil {
                        log.G(ctx).Warnf("CDI registry refresh failed: %v", refreshErr)
                }

                _, injectErr := cdi.InjectDevices(s, devices...)
                if injectErr != nil {
                        return fmt.Errorf("CDI device injection failed: %w", injectErr)
                }

                // Keep in mind that CDI device injection might add OCI Spec
                // environment variables, hooks, and mounts as well, so none of
                // those OCI Spec fields may be reset up in the call stack once
                // we return.
                return nil
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package oci

import (
        "context"

        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/pkg/cap"
        specs "github.com/opencontainers/runtime-spec/specs-go"
)

// WithHostDevices adds all the hosts device nodes, as reported by
// HostDevices, to the container's spec (Linux.Devices).
func WithHostDevices(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        setLinux(s)

        hostDevs, err := HostDevices()
        if err != nil {
                return err
        }

        lnx := s.Linux
        lnx.Devices = append(lnx.Devices, hostDevs...)
        return nil
}

// WithDevices recursively adds devices from the passed in path and associated cgroup rules for that device.
// If devicePath is a dir it traverses the dir to add all devices in that dir.
// If devicePath is not a dir, it attempts to add the single device.
// If containerPath is not set then the device path is used for the container path.
//
// Each device found is appended to Linux.Devices together with an allow rule
// (type, major, minor, permissions) in Linux.Resources.Devices. setLinux and
// setResources are called first, as withLinuxDevice does, so that a spec
// lacking those sections does not cause a nil-pointer dereference.
func WithDevices(devicePath, containerPath, permissions string) SpecOpts {
        return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
                setLinux(s)
                setResources(s)

                devs, err := getDevices(devicePath, containerPath)
                if err != nil {
                        return err
                }
                for i := range devs {
                        s.Linux.Devices = append(s.Linux.Devices, devs[i])
                        s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, specs.LinuxDeviceCgroup{
                                Allow: true,
                                Type:  devs[i].Type,
                                // Point at the slice element, not at a loop copy.
                                Major:  &devs[i].Major,
                                Minor:  &devs[i].Minor,
                                Access: permissions,
                        })
                }
                return nil
        }
}

// WithAllCurrentCapabilities propagates the effective capabilities of the caller process to the container process.
// The capability set may differ from WithAllKnownCapabilities when running in a container.
// An error from cap.Current is returned unchanged.
var WithAllCurrentCapabilities = func(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
        current, err := cap.Current()
        if err != nil {
                return err
        }
        apply := WithCapabilities(current)
        return apply(ctx, client, c, s)
}

// WithAllKnownCapabilities sets all the known linux capabilities, as listed
// by cap.Known, for the container process.
var WithAllKnownCapabilities = func(ctx context.Context, client Client, c *containers.Container, s *Spec) error {
        apply := WithCapabilities(cap.Known())
        return apply(ctx, client, c, s)
}

// escapeAndCombineArgs is not supported in this build: it always panics with
// "not supported" and never returns a value.
func escapeAndCombineArgs(args []string) string {
        const reason = "not supported"
        panic(reason)
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package oci

import (
        "context"

        "github.com/containerd/containerd/v2/core/containers"
)

// WithDefaultPathEnv sets the $PATH environment variable to the
// default PATH defined in this package, by merging defaultUnixEnv into
// Process.Env with replaceOrAppendEnvValues.
func WithDefaultPathEnv(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
        merged := replaceOrAppendEnvValues(s.Process.Env, defaultUnixEnv)
        s.Process.Env = merged
        return nil
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package oci

import (
        "errors"
        "fmt"
        "os"
        "path/filepath"

        "github.com/containerd/containerd/v2/pkg/userns"
        specs "github.com/opencontainers/runtime-spec/specs-go"
        "golang.org/x/sys/unix"
)

// ErrNotADevice denotes that a file is not a valid linux device.
// DeviceFromPath returns it when the file's mode is neither a block device,
// a character device nor a FIFO.
var ErrNotADevice = errors.New("not a device node")

// Testing dependencies: package-level indirections, presumably swapped out by
// tests (the tests are not visible here).
var (
        // osReadDir is what getDevices uses to list the entries of a directory.
        osReadDir              = os.ReadDir
        // usernsRunningInUserNS is consulted by getDevices to decide whether a
        // permission error may be skipped.
        usernsRunningInUserNS  = userns.RunningInUserNS
        // overrideDeviceFromPath, when non-nil, is called first by
        // DeviceFromPath; a non-nil error from it is returned to the caller.
        overrideDeviceFromPath func(path string) error
)

// HostDevices returns all devices that can be found under /dev directory.
// No container path is given to getDevices, so device paths are left as found.
func HostDevices() ([]specs.LinuxDevice, error) {
        const hostDevDir = "/dev"
        return getDevices(hostDevDir, "")
}

// getDevices collects the device nodes found at path.
//
// If path is not a directory it is looked up with DeviceFromPath and returned
// as the single result; a non-empty containerPath replaces its Path.
//
// If path is a directory its entries are visited:
//   - a fixed set of sub-directories (see the case list below) is skipped;
//   - other sub-directories are traversed recursively, with containerPath
//     (when set) extended by the entry name;
//   - an entry named "console" is skipped;
//   - every other entry is looked up with DeviceFromPath. Entries that are
//     not device nodes, that no longer exist, or that are FIFOs are skipped.
//
// Permission errors are ignored only while running in a user namespace; any
// other error aborts the walk and is returned.
func getDevices(path, containerPath string) ([]specs.LinuxDevice, error) {
        stat, err := os.Stat(path)
        if err != nil {
                return nil, fmt.Errorf("error stating device path: %w", err)
        }

        if !stat.IsDir() {
                dev, err := DeviceFromPath(path)
                if err != nil {
                        return nil, err
                }
                if containerPath != "" {
                        dev.Path = containerPath
                }
                return []specs.LinuxDevice{*dev}, nil
        }

        files, err := osReadDir(path)
        if err != nil {
                return nil, err
        }
        var out []specs.LinuxDevice
        for _, f := range files {
                switch {
                case f.IsDir():
                        switch f.Name() {
                        // ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825
                        // ".udev" added to address https://github.com/opencontainers/runc/issues/2093
                        case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts", ".udev":
                                continue
                        default:
                                var cp string
                                if containerPath != "" {
                                        cp = filepath.Join(containerPath, filepath.Base(f.Name()))
                                }
                                sub, err := getDevices(filepath.Join(path, f.Name()), cp)
                                if err != nil {
                                        if errors.Is(err, os.ErrPermission) && usernsRunningInUserNS() {
                                                // ignore the "permission denied" error if running in userns.
                                                // This allows rootless containers to use devices that are
                                                // accessible, ignoring devices / subdirectories that are not.
                                                continue
                                        }
                                        return nil, err
                                }

                                out = append(out, sub...)
                                continue
                        }
                case f.Name() == "console":
                        continue
                default:
                        device, err := DeviceFromPath(filepath.Join(path, f.Name()))
                        if err != nil {
                                // errors.Is also matches wrapped errors, in line with
                                // the ErrPermission check below.
                                if errors.Is(err, ErrNotADevice) {
                                        continue
                                }
                                // The entry may have vanished between the directory
                                // listing and the lookup.
                                if errors.Is(err, os.ErrNotExist) {
                                        continue
                                }
                                if errors.Is(err, os.ErrPermission) && usernsRunningInUserNS() {
                                        // ignore the "permission denied" error if running in userns.
                                        // This allows rootless containers to use devices that are
                                        // accessible, ignoring devices that are not.
                                        continue
                                }
                                return nil, err
                        }
                        if device.Type == fifoDevice {
                                continue
                        }
                        if containerPath != "" {
                                device.Path = filepath.Join(containerPath, filepath.Base(f.Name()))
                        }
                        out = append(out, *device)
                }
        }
        return out, nil
}

// TODO consider adding these consts to the OCI runtime-spec.
//
// Device type identifiers; DeviceFromPath stores one of them in
// specs.LinuxDevice.Type.
const (
        wildcardDevice = "a" //nolint:nolintlint,unused,varcheck // currently unused, but should be included when upstreaming to OCI runtime-spec.
        // blockDevice is reported for files whose mode is unix.S_IFBLK.
        blockDevice    = "b"
        // charDevice is reported for files whose mode is unix.S_IFCHR.
        charDevice     = "c" // or "u"
        // fifoDevice is reported for files whose mode is unix.S_IFIFO;
        // getDevices skips entries of this type.
        fifoDevice     = "p"
)

// DeviceFromPath looks up the file at path (without following a final
// symlink, since unix.Lstat is used) and describes it as a LinuxDevice.
//
// The result carries the device type, the given path, the major and minor
// numbers taken from the file's Rdev, the mode bits with the file-type bits
// cleared, and the owning UID and GID. ErrNotADevice is returned when the
// file is neither a block device, a character device nor a FIFO. When the
// overrideDeviceFromPath hook is set it runs first and its error, if any, is
// returned as is.
func DeviceFromPath(path string) (*specs.LinuxDevice, error) {
        if hook := overrideDeviceFromPath; hook != nil {
                if err := hook(path); err != nil {
                        return nil, err
                }
        }

        var st unix.Stat_t
        if err := unix.Lstat(path, &st); err != nil {
                return nil, err
        }

        var kind string
        switch st.Mode & unix.S_IFMT {
        case unix.S_IFBLK:
                kind = blockDevice
        case unix.S_IFCHR:
                kind = charDevice
        case unix.S_IFIFO:
                kind = fifoDevice
        default:
                return nil, ErrNotADevice
        }

        rdev := uint64(st.Rdev) //nolint:nolintlint,unconvert // the type is 32bit on mips.
        perm := os.FileMode(st.Mode &^ unix.S_IFMT)

        return &specs.LinuxDevice{
                Type:     kind,
                Path:     path,
                Major:    int64(unix.Major(rdev)),
                Minor:    int64(unix.Minor(rdev)),
                FileMode: &perm,
                UID:      &st.Uid,
                GID:      &st.Gid,
        }, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package os

import (
        "github.com/containerd/containerd/v2/core/mount"
        "golang.org/x/sys/unix"
)

// Mount mounts source on target by calling unix.Mount with the arguments
// passed through unchanged.
func (RealOS) Mount(source string, target string, fstype string, flags uintptr, data string) error {
        err := unix.Mount(source, target, fstype, flags, data)
        return err
}

// Unmount unmounts target by calling mount.Unmount with the unix.MNT_DETACH
// flag.
func (RealOS) Unmount(target string) error {
        const flags = unix.MNT_DETACH
        return mount.Unmount(target, flags)
}

// LookupMount gets mount info of a given path by calling mount.Lookup.
func (RealOS) LookupMount(path string) (mount.Info, error) {
        info, err := mount.Lookup(path)
        return info, err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package os

import (
        "io"
        "os"

        "github.com/moby/sys/symlink"

        "github.com/containerd/containerd/v2/core/mount"
)

// OS collects system level operations that need to be mocked out
// during tests. RealOS is the implementation backed by the actual system;
// the notes on each method describe what RealOS does.
type OS interface {
        // MkdirAll is backed by os.MkdirAll in RealOS.
        MkdirAll(path string, perm os.FileMode) error
        // RemoveAll is backed by os.RemoveAll in RealOS.
        RemoveAll(path string) error
        // Stat is backed by os.Stat in RealOS.
        Stat(name string) (os.FileInfo, error)
        // ResolveSymbolicLink follows symbolic links; RealOS returns name
        // unchanged when it is not a symlink.
        ResolveSymbolicLink(name string) (string, error)
        // FollowSymlinkInScope is backed by symlink.FollowSymlinkInScope in RealOS.
        FollowSymlinkInScope(path, scope string) (string, error)
        // CopyFile copies the src file to the dest file, which RealOS opens
        // with create and truncate flags and the permissions perm.
        CopyFile(src, dest string, perm os.FileMode) error
        // WriteFile is backed by os.WriteFile in RealOS.
        WriteFile(filename string, data []byte, perm os.FileMode) error
        // Hostname is backed by os.Hostname in RealOS.
        Hostname() (string, error)
        // Mount is backed by unix.Mount in RealOS.
        Mount(source string, target string, fstype string, flags uintptr, data string) error
        // Unmount is backed by mount.Unmount with unix.MNT_DETACH in RealOS.
        Unmount(target string) error
        // LookupMount is backed by mount.Lookup in RealOS.
        LookupMount(path string) (mount.Info, error)
}

// RealOS is used to dispatch the real system level operations.
// It is an empty struct with no fields, so the zero value is ready to use.
type RealOS struct{}

// MkdirAll creates the directory path with the permissions perm by
// delegating to os.MkdirAll.
func (RealOS) MkdirAll(path string, perm os.FileMode) error {
        err := os.MkdirAll(path, perm)
        return err
}

// RemoveAll removes the path and its children by delegating to os.RemoveAll.
func (RealOS) RemoveAll(path string) error {
        err := os.RemoveAll(path)
        return err
}

// Stat gets the status of the given file by delegating to os.Stat.
func (RealOS) Stat(name string) (os.FileInfo, error) {
        info, err := os.Stat(name)
        return info, err
}

// FollowSymlinkInScope delegates to symlink.FollowSymlinkInScope with the
// same path and scope.
func (RealOS) FollowSymlinkInScope(path, scope string) (string, error) {
        resolved, err := symlink.FollowSymlinkInScope(path, scope)
        return resolved, err
}

// CopyFile copies the contents of the src file to the dest file.
//
// dest is opened write-only; it is created with the permissions perm if it
// does not exist and truncated if it does. The first error met while opening
// either file, copying, or closing dest is returned.
func (RealOS) CopyFile(src, dest string, perm os.FileMode) error {
        in, err := os.Open(src)
        if err != nil {
                return err
        }
        defer in.Close()

        out, err := os.OpenFile(dest, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, perm)
        if err != nil {
                return err
        }

        if _, err := io.Copy(out, in); err != nil {
                // The copy error is the one worth reporting; the close is
                // best-effort cleanup here.
                out.Close()
                return err
        }
        // Closing a written file can itself fail (e.g. on a delayed write
        // error), so the result must not be dropped.
        return out.Close()
}

// WriteFile writes data into the file filename with the permissions perm by
// delegating to os.WriteFile.
func (RealOS) WriteFile(filename string, data []byte, perm os.FileMode) error {
        err := os.WriteFile(filename, data, perm)
        return err
}

// Hostname gets the hostname of the host by delegating to os.Hostname.
func (RealOS) Hostname() (string, error) {
        name, err := os.Hostname()
        return name, err
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package os

import (
        "os"
        "path/filepath"
)

// ResolveSymbolicLink will follow any symbolic links.
//
// A path that is not itself a symlink is returned unchanged; otherwise the
// result of filepath.EvalSymlinks is returned. An error from os.Lstat is
// returned together with an empty string.
func (RealOS) ResolveSymbolicLink(path string) (string, error) {
        fi, err := os.Lstat(path)
        if err != nil {
                return "", err
        }
        if fi.Mode()&os.ModeSymlink == 0 {
                return path, nil
        }
        return filepath.EvalSymlinks(path)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package protobuf

import (
        "github.com/containerd/typeurl/v2"
        "google.golang.org/protobuf/types/known/anypb"
)

// FromAny converts a typeurl.Any to an *anypb.Any.
//
// A nil input yields nil. A value that already is an *anypb.Any is returned
// as is; anything else is copied into a new anypb.Any carrying the same type
// URL and value.
func FromAny(from typeurl.Any) *anypb.Any {
        switch v := from.(type) {
        case nil:
                return nil
        case *anypb.Any:
                return v
        default:
                return &anypb.Any{
                        TypeUrl: v.GetTypeUrl(),
                        Value:   v.GetValue(),
                }
        }
}

// MarshalAnyToProto marshals an arbitrary value with typeurl.MarshalAny and
// converts the result to an *anypb.Any via FromAny. A marshalling error is
// returned together with a nil result.
func MarshalAnyToProto(from interface{}) (*anypb.Any, error) {
        marshaled, err := typeurl.MarshalAny(from)
        if err != nil {
                return nil, err
        }
        converted := FromAny(marshaled)
        return converted, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package protobuf

import (
        "github.com/google/go-cmp/cmp"
        "google.golang.org/protobuf/proto"
)

// Compare is a cmp option that applies only when both values being compared
// implement proto.Message, in which case equality is decided by proto.Equal.
var Compare = cmp.FilterValues(
        // Filter: select pairs where both sides are proto messages.
        func(x, y interface{}) bool {
                if _, ok := x.(proto.Message); !ok {
                        return false
                }
                _, ok := y.(proto.Message)
                return ok
        },
        // Comparer: delegate to proto.Equal; anything that is not a proto
        // message on either side compares as unequal.
        cmp.Comparer(func(x, y interface{}) bool {
                left, leftOK := x.(proto.Message)
                right, rightOK := y.(proto.Message)
                if !leftOK || !rightOK {
                        return false
                }
                return proto.Equal(left, right)
        }),
)

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package proto provides convenient aliases that make google.golang.org/protobuf migration easier.
package proto

import (
        google "google.golang.org/protobuf/proto"
)

// Marshal encodes input by delegating to Marshal from
// google.golang.org/protobuf/proto and returns its result unchanged.
func Marshal(input google.Message) ([]byte, error) {
        return google.Marshal(input)
}

// Unmarshal decodes input into output by delegating to Unmarshal from
// google.golang.org/protobuf/proto and returns its error unchanged.
func Unmarshal(input []byte, output google.Message) error {
        return google.Unmarshal(input, output)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package protobuf

import (
        "time"

        "google.golang.org/protobuf/types/known/timestamppb"
)

// Once we migrate off of gogo/protobuf, we can use the functions below, which don't return any errors.
// https://github.com/protocolbuffers/protobuf-go/blob/v1.28.0/types/known/timestamppb/timestamp.pb.go#L200-L208

// ToTimestamp creates protobuf's Timestamp from time.Time.
// It is a thin wrapper around timestamppb.New.
func ToTimestamp(from time.Time) *timestamppb.Timestamp {
        return timestamppb.New(from)
}

// FromTimestamp creates time.Time from protobuf's Timestamp.
// It is a thin wrapper around the Timestamp's AsTime method.
func FromTimestamp(from *timestamppb.Timestamp) time.Time {
        return from.AsTime()
}

//go:build linux && !no_rdt

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package rdt

import (
        "fmt"
        "sync"

        "github.com/containerd/log"

        "github.com/intel/goresctrl/pkg/rdt"
)

const (
        // ResctrlPrefix is the prefix used for class/closid directories under the resctrl filesystem.
        // It is passed to rdt.Initialize by SetConfig and is currently empty.
        ResctrlPrefix = ""
)

var (
        // enabled records whether the most recent SetConfig call applied an
        // RDT configuration successfully. It is guarded by enabledMu.
        enabled bool
        // enabledMu protects enabled: SetConfig holds the write lock,
        // IsEnabled holds the read lock.
        enabledMu sync.RWMutex
)

// IsEnabled reports whether rdt is enabled, i.e. the current value of the
// enabled flag, read under the read lock.
func IsEnabled() bool {
        enabledMu.RLock()
        state := enabled
        enabledMu.RUnlock()

        return state
}

var (
        // initOnce ensures rdt.Initialize is attempted at most once, from the
        // first SetConfig call that is given a non-empty config path.
        initOnce sync.Once
        // initErr holds the wrapped error from that single initialization
        // attempt; once set, every later SetConfig call with a non-empty path
        // returns it.
        initErr error
)

// SetConfig updates the rdt configuration from the file at configFilePath.
//
// The enabled flag is cleared first and only set again after the
// configuration has been applied successfully. An empty path leaves RDT
// unconfigured and returns nil. The rdt library is initialized at most once;
// if that initialization failed, the same error is returned on every call
// with a non-empty path.
func SetConfig(configFilePath string) error {
        enabledMu.Lock()
        defer enabledMu.Unlock()

        // Assume "disabled" until the whole sequence below succeeds.
        enabled = false

        if len(configFilePath) == 0 {
                log.L.Debug("No RDT config file specified, RDT not configured")
                return nil
        }

        initOnce.Do(func() {
                if err := rdt.Initialize(ResctrlPrefix); err != nil {
                        initErr = fmt.Errorf("RDT not enabled: %w", err)
                }
        })
        if initErr != nil {
                return initErr
        }

        err := rdt.SetConfigFromFile(configFilePath, true)
        if err == nil {
                enabled = true
        }
        return err
}

// ContainerClassFromAnnotations examines container and pod annotations of a
// container and returns its RDT class.
//
// It forwards its arguments unchanged to rdt.ContainerClassFromAnnotations
// from goresctrl and returns that function's result.
func ContainerClassFromAnnotations(containerName string, containerAnnotations, podAnnotations map[string]string) (string, error) {
        return rdt.ContainerClassFromAnnotations(containerName, containerAnnotations, podAnnotations)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package reference

import (
        "errors"
        "fmt"
        "net/url"
        "path"
        "regexp"
        "strings"

        digest "github.com/opencontainers/go-digest"
)

var (
        // ErrInvalid is returned when there is an invalid reference. Parse
        // returns it when the input contains "://" or does not parse back with
        // the expected placeholder scheme.
        ErrInvalid = errors.New("invalid reference")
        // ErrObjectRequired is returned when the object is required
        ErrObjectRequired = errors.New("object required")
        // ErrHostnameRequired is returned when the hostname is required. Parse
        // returns it when the parsed reference has an empty host.
        ErrHostnameRequired = errors.New("hostname required")
)

// Spec defines the main components of a reference specification.
//
// A reference specification is a schema-less URI parsed into common
// components. The two main components, locator and object, are required to be
// supported by remotes. It represents a superset of the naming defined in
// docker's reference schema. It aims to be compatible but not prescriptive.
//
// While the interpretation of the components, locator and object, is up to
// the remote, we define a few common parts, accessible via helper methods.
//
// The first is the hostname, which is part of the locator. This doesn't need
// to map to a physical resource, but it must parse as a hostname. We refer to
// this as the namespace.
//
// The other component made accessible by helper method is the digest. This is
// part of the object identifier, always prefixed with an '@'. If present, the
// remote may use the digest portion directly or resolve it against a prefix.
// If the object does not include the `@` symbol, the return value for `Digest`
// will be empty.
type Spec struct {
        // Locator is the host and path portion of the specification. The host
        // portion may refer to an actual host or just a namespace of related
        // images.
        //
        // Typically, the locator may be used to resolve the remote to fetch
        // specific resources.
        Locator string

        // Object contains the identifier for the remote resource. Classically,
        // this is a tag but can refer to anything in a remote. By convention, any
        // portion that may be a partial or whole digest will be preceded by an
        // `@`. Anything preceding the `@` will be referred to as the "tag".
        //
        // In practice, we will see this broken down into the following formats:
        //
        // 1. <tag>
        // 2. <tag>@<digest spec>
        // 3. @<digest spec>
        //
        // We define the tag to be anything except '@' and ':'. <digest spec> may
        // be a full valid digest or shortened version, possibly with elided
        // algorithm.
        Object string
}

// splitRe matches the first ':' or '@' in a path; Parse uses it to find where
// the locator ends and the object begins.
var splitRe = regexp.MustCompile(`[:@]`)

// Parse parses the string into a structured ref.
//
// The input must not contain "://"; it is parsed as a URL behind a placeholder
// scheme so that host and path can be separated. ErrInvalid is returned for
// inputs that carry a scheme, ErrHostnameRequired when no host is present, and
// any URL parsing error is returned unchanged. The path is split at the first
// ':' or '@': what precedes it joins the host to form the locator, and what
// follows becomes the object (a leading ':' is dropped, a leading '@' is kept).
func Parse(s string) (Spec, error) {
        if strings.Contains(s, "://") {
                return Spec{}, ErrInvalid
        }

        parsed, err := url.Parse("dummy://" + s)
        switch {
        case err != nil:
                return Spec{}, err
        case parsed.Scheme != "dummy":
                return Spec{}, ErrInvalid
        case parsed.Host == "":
                return Spec{}, ErrHostnameRequired
        }

        locatorPath, object := parsed.Path, ""
        if loc := splitRe.FindStringIndex(locatorPath); loc != nil {
                // Keep the '@' so digests (or shortened digests) stay
                // recognizable in the object; only the ':' tag separator is
                // stripped.
                object = locatorPath[loc[0]:]
                if object[0] == ':' {
                        object = object[1:]
                }
                locatorPath = locatorPath[:loc[0]]
        }

        return Spec{
                Locator: path.Join(parsed.Host, locatorPath),
                Object:  object,
        }, nil
}

// Hostname returns the hostname portion of the locator: everything before the
// first "/", or the whole locator when it contains no "/".
//
// Remotes are not required to directly access the resources at this host. This
// method is provided for convenience.
func (r Spec) Hostname() string {
        if slash := strings.Index(r.Locator, "/"); slash >= 0 {
                return r.Locator[:slash]
        }
        return r.Locator
}

// Digest returns the digest portion of the reference spec. This may be a
// partial or invalid digest, which may be used to lookup a complete digest.
//
// It is the second value returned by SplitObject for r.Object, so it is empty
// when the object contains no '@'.
func (r Spec) Digest() digest.Digest {
        _, dgst := SplitObject(r.Object)
        return dgst
}

// String returns the normalized string for the ref: the locator alone when
// there is no object, locator and object concatenated directly when the object
// starts with '@', and "locator:object" otherwise.
func (r Spec) String() string {
        switch {
        case r.Object == "":
                return r.Locator
        case r.Object[0] == '@':
                return r.Locator + r.Object
        default:
                return r.Locator + ":" + r.Object
        }
}

// SplitObject provides two parts of the object spec, delimited by an `@`
// symbol.
//
// The split happens after the first `@`, so when one is present the returned
// tag keeps its trailing `@` and the digest is everything after it. Without an
// `@` the whole input is returned as the tag and the digest is empty.
//
// Either may be empty and it is the callers job to validate them
// appropriately.
func SplitObject(obj string) (tag string, dgst digest.Digest) {
        at := strings.Index(obj, "@")
        if at < 0 {
                return obj, ""
        }
        return obj[:at+1], digest.Digest(obj[at+1:])
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package rootfs

import (
        "context"
        "crypto/rand"
        "encoding/base64"
        "fmt"
        "time"

        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/opencontainers/go-digest"
        "github.com/opencontainers/image-spec/identity"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// Layer represents the descriptors for a layer diff. These descriptions
// include the descriptor for the uncompressed tar diff as well as a blob
// used to transport that tar. The blob descriptor may or may not describe
// a compressed object.
type Layer struct {
        // Diff describes the layer diff; its digest is the value used to build
        // the chain ID and to verify the result of applying Blob.
        Diff ocispec.Descriptor
        // Blob describes the content handed to the applier.
        Blob ocispec.Descriptor
}

// ApplyLayers applies all the layers using the given snapshotter and applier.
// The returned result is a chain id digest representing all the applied layers.
// Layers are applied in order they are given, making the first layer the
// bottom-most layer in the layer chain.
//
// It is equivalent to calling ApplyLayersWithOpts with nil apply options.
func ApplyLayers(ctx context.Context, layers []Layer, sn snapshots.Snapshotter, a diff.Applier) (digest.Digest, error) {
        return ApplyLayersWithOpts(ctx, layers, sn, a, nil)
}

// ApplyLayersWithOpts applies all the layers using the given snapshotter,
// applier, and apply opts, and returns the chain id digest representing all
// the applied layers. Layers are applied in the order given, so the first
// layer is the bottom-most layer of the chain.
//
// If a snapshot for the full chain already exists nothing is applied. An
// "already exists" error from applying is treated as success.
func ApplyLayersWithOpts(ctx context.Context, layers []Layer, sn snapshots.Snapshotter, a diff.Applier, applyOpts []diff.ApplyOpt) (digest.Digest, error) {
        diffIDs := make([]digest.Digest, 0, len(layers))
        for _, l := range layers {
                diffIDs = append(diffIDs, l.Diff.Digest)
        }
        chainID := identity.ChainID(diffIDs)

        // Only the top layer is stat'ed here; the lower layers have their
        // existence checked on prepare. Preparing upper layers first guarantees
        // that they are not removed while lower layers are being checked.
        _, statErr := sn.Stat(ctx, chainID.String())
        switch {
        case statErr == nil:
                return chainID, nil
        case !errdefs.IsNotFound(statErr):
                return "", fmt.Errorf("failed to stat snapshot %s: %w", chainID, statErr)
        }

        err := applyLayers(ctx, layers, diffIDs, sn, a, nil, applyOpts)
        if err != nil && !errdefs.IsAlreadyExists(err) {
                return "", err
        }
        return chainID, nil
}

// ApplyLayer applies a single layer on top of the given provided layer chain,
// using the provided snapshotter and applier. If the layer was unpacked true
// is returned, if the layer already exists false is returned.
//
// It is equivalent to calling ApplyLayerWithOpts with the given snapshot
// options and nil apply options.
func ApplyLayer(ctx context.Context, layer Layer, chain []digest.Digest, sn snapshots.Snapshotter, a diff.Applier, opts ...snapshots.Opt) (bool, error) {
        return ApplyLayerWithOpts(ctx, layer, chain, sn, a, opts, nil)
}

// ApplyLayerWithOpts applies a single layer on top of the given provided layer
// chain, using the provided snapshotter, applier, and apply opts.
//
// It returns true when the layer was unpacked by this call and false when a
// snapshot for the resulting chain already exists (either found up front or
// reported as "already exists" while applying). Any other stat or apply
// failure is returned as an error.
//
// The caller's chain slice is never written to.
func ApplyLayerWithOpts(ctx context.Context, layer Layer, chain []digest.Digest, sn snapshots.Snapshotter, a diff.Applier, opts []snapshots.Opt, applyOpts []diff.ApplyOpt) (bool, error) {
        // Clip the capacity before appending so the new element always lands in
        // a fresh backing array instead of the caller's spare capacity, and
        // build the extended chain only once.
        fullChain := append(chain[:len(chain):len(chain)], layer.Diff.Digest)
        chainID := identity.ChainID(fullChain).String()

        if _, err := sn.Stat(ctx, chainID); err == nil {
                // Snapshot for this chain is already present; nothing to do.
                return false, nil
        } else if !errdefs.IsNotFound(err) {
                return false, fmt.Errorf("failed to stat snapshot %s: %w", chainID, err)
        }

        if err := applyLayers(ctx, []Layer{layer}, fullChain, sn, a, opts, applyOpts); err != nil {
                if errdefs.IsAlreadyExists(err) {
                        // Someone else committed the same chain concurrently.
                        return false, nil
                }
                return false, err
        }
        return true, nil
}

// applyLayers unpacks the last layer of layers into a snapshot whose committed
// name is the chain ID of chain, using the chain ID of chain minus its last
// element as the parent.
//
// When preparing the extraction snapshot fails with "not found" and more than
// one layer was supplied, the lower layers are applied recursively first and
// the prepare is retried. After a successful prepare the layer blob is
// applied, the resulting diff digest is checked against layer.Diff.Digest, and
// the snapshot is committed. If any of those later steps fails, the extraction
// snapshot is removed again by the deferred cleanup.
//
// NOTE(review): chain and layers are indexed with len-1 without a length
// check, so both are assumed to be non-empty — confirm callers guarantee this.
func applyLayers(ctx context.Context, layers []Layer, chain []digest.Digest, sn snapshots.Snapshotter, a diff.Applier, opts []snapshots.Opt, applyOpts []diff.ApplyOpt) error {
        var (
                parent  = identity.ChainID(chain[:len(chain)-1])
                chainID = identity.ChainID(chain)
                layer   = layers[len(layers)-1]
                diff    ocispec.Descriptor
                key     string
                mounts  []mount.Mount
                err     error
        )

        // Loop until an extraction snapshot has been prepared under a fresh key.
        for {
                key = fmt.Sprintf(snapshots.UnpackKeyFormat, uniquePart(), chainID)

                // Prepare the extraction snapshot on top of the parent chain.
                mounts, err = sn.Prepare(ctx, key, parent.String(), opts...)
                if err != nil {
                        if errdefs.IsNotFound(err) && len(layers) > 1 {
                                // The parent snapshot is missing: apply the lower layers
                                // first. "Already exists" from the recursion is fine.
                                if err := applyLayers(ctx, layers[:len(layers)-1], chain[:len(chain)-1], sn, a, opts, applyOpts); err != nil {
                                        if !errdefs.IsAlreadyExists(err) {
                                                return err
                                        }
                                }
                                // Do not try applying the lower layers again: with layers
                                // set to nil the len(layers) > 1 guard above is false on
                                // the retry. The layer to extract was captured earlier.
                                layers = nil
                                continue
                        } else if errdefs.IsAlreadyExists(err) {
                                // The generated key collided; try a different key.
                                continue
                        }

                        // Any other prepare failure is returned to the caller.
                        return fmt.Errorf("failed to prepare extraction snapshot %q: %w", key, err)

                }
                break
        }
        // From here on the outer err variable drives cleanup: every failing
        // path below assigns it before returning so that this deferred function
        // removes the extraction snapshot.
        defer func() {
                if err != nil {
                        if !errdefs.IsAlreadyExists(err) {
                                log.G(ctx).WithError(err).WithField("key", key).Infof("apply failure, attempting cleanup")
                        }

                        if rerr := sn.Remove(ctx, key); rerr != nil {
                                log.G(ctx).WithError(rerr).WithField("key", key).Warnf("extraction snapshot removal failed")
                        }
                }
        }()

        // Note: the local variable diff shadows the imported diff package from
        // this point on.
        diff, err = a.Apply(ctx, layer.Blob, mounts, applyOpts...)
        if err != nil {
                err = fmt.Errorf("failed to extract layer %s: %w", layer.Diff.Digest, err)
                return err
        }
        // The digest computed during extraction must match the expected diff ID.
        if diff.Digest != layer.Diff.Digest {
                err = fmt.Errorf("wrong diff id calculated on extraction %q", diff.Digest)
                return err
        }

        // Commit the extraction snapshot under the chain ID.
        if err = sn.Commit(ctx, chainID.String(), key, opts...); err != nil {
                err = fmt.Errorf("failed to commit snapshot %s: %w", key, err)
                return err
        }

        return nil
}

// uniquePart returns a short, probably-unique token of the form
// "<nanoseconds>-<suffix>", where the first part is the nanosecond component
// of the current time and the suffix is three random bytes in URL-safe base64.
func uniquePart() string {
        now := time.Now()

        var entropy [3]byte
        // Best effort: a failed read only makes the suffix less unique.
        _, _ = rand.Read(entropy[:])

        suffix := base64.URLEncoding.EncodeToString(entropy[:])
        return fmt.Sprintf("%d-%s", now.Nanosecond(), suffix)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package rootfs

import (
        "context"
        "fmt"

        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/internal/cleanup"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// CreateDiff creates a layer diff for the given snapshot identifier from the
// parent of the snapshot, using the provided snapshotter and comparer, and
// returns the descriptor produced by the comparer.
//
// A temporary view of the parent is always created as the lower side. For an
// active snapshot its own mounts form the upper side; otherwise a temporary
// view of the snapshot is used. Temporary views are removed when the function
// returns.
func CreateDiff(ctx context.Context, snapshotID string, sn snapshots.Snapshotter, d diff.Comparer, opts ...diff.Opt) (ocispec.Descriptor, error) {
        var none ocispec.Descriptor

        info, err := sn.Stat(ctx, snapshotID)
        if err != nil {
                return none, err
        }

        parentView := fmt.Sprintf("%s-parent-view-%s", info.Parent, uniquePart())
        lower, err := sn.View(ctx, parentView, info.Parent)
        if err != nil {
                return none, err
        }
        defer cleanup.Do(ctx, func(ctx context.Context) {
                sn.Remove(ctx, parentView)
        })

        var upper []mount.Mount
        switch info.Kind {
        case snapshots.KindActive:
                if upper, err = sn.Mounts(ctx, snapshotID); err != nil {
                        return none, err
                }
        default:
                snapshotView := fmt.Sprintf("%s-view-%s", snapshotID, uniquePart())
                if upper, err = sn.View(ctx, snapshotView, snapshotID); err != nil {
                        return none, err
                }
                defer cleanup.Do(ctx, func(ctx context.Context) {
                        sn.Remove(ctx, snapshotView)
                })
        }

        return d.Compare(ctx, lower, upper, opts...)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package rootfs

import (
        "context"
        "errors"
        "fmt"
        "os"

        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/log"
        digest "github.com/opencontainers/go-digest"
)

var (
        // initializers maps an initializer name to the function that populates
        // an init layer; InitRootFS looks up defaultInitializer in it.
        initializers = map[string]initializerFunc{}
)

// initializerFunc prepares the directory tree rooted at the given path.
type initializerFunc func(string) error

// Mounter handles mount and unmount
type Mounter interface {
        // Mount mounts the given mounts at target.
        Mount(target string, mounts ...mount.Mount) error
        // Unmount unmounts target.
        Unmount(target string) error
}

// InitRootFS initializes the snapshot for use as a rootfs.
//
// It fails if a snapshot called name can already be stat'ed. When an
// initializer is registered under defaultInitializer, an init layer is created
// (or reused) on top of parent and becomes the parent of the new snapshot. The
// new snapshot is a view when readonly is set and a prepared (writable)
// snapshot otherwise; its mounts are returned.
func InitRootFS(ctx context.Context, name string, parent digest.Digest, readonly bool, snapshotter snapshots.Snapshotter, mounter Mounter) ([]mount.Mount, error) {
        if _, err := snapshotter.Stat(ctx, name); err == nil {
                return nil, errors.New("rootfs already exists")
        }
        // TODO: ensure not exist error once added to snapshot package

        base := parent.String()

        if initFn := initializers[defaultInitializer]; initFn != nil {
                initLayer, err := createInitLayer(ctx, base, defaultInitializer, initFn, snapshotter, mounter)
                if err != nil {
                        return nil, err
                }
                base = initLayer
        }

        if !readonly {
                return snapshotter.Prepare(ctx, name, base)
        }
        return snapshotter.View(ctx, name, base)
}

// createInitLayer returns the name of the init-layer snapshot for parent and
// initName, creating it if it cannot be stat'ed.
//
// Creation prepares a snapshot keyed by a fresh temporary directory, mounts it
// on that directory, runs initFn on it, unmounts it and commits it under the
// init-layer name. The temporary directory is always removed; the prepared
// snapshot is removed as well when the function returns an error.
func createInitLayer(ctx context.Context, parent, initName string, initFn func(string) error, snapshotter snapshots.Snapshotter, mounter Mounter) (_ string, retErr error) {
        initKey := fmt.Sprintf("%s %s", parent, initName)
        if _, statErr := snapshotter.Stat(ctx, initKey); statErr == nil {
                return initKey, nil
        }
        // TODO: ensure not exist error once added to snapshot package

        // The temporary directory doubles as mount point and snapshot key.
        workDir, err := os.MkdirTemp(os.Getenv("XDG_RUNTIME_DIR"), "create-init-")
        if err != nil {
                return "", err
        }
        defer os.RemoveAll(workDir)

        mounts, err := snapshotter.Prepare(ctx, workDir, parent)
        if err != nil {
                return "", err
        }

        defer func() {
                if retErr == nil {
                        return
                }
                if rerr := snapshotter.Remove(ctx, workDir); rerr != nil {
                        log.G(ctx).Errorf("Failed to remove snapshot %s: %v", workDir, rerr)
                }
        }()

        if err := mounter.Mount(workDir, mounts...); err != nil {
                return "", err
        }

        // Always unmount after running the initializer. An initializer failure
        // takes precedence; an unmount failure is then only logged.
        initErr := initFn(workDir)
        unmountErr := mounter.Unmount(workDir)
        if initErr != nil {
                if unmountErr != nil {
                        log.G(ctx).Errorf("Failed to unmount %s: %v", workDir, unmountErr)
                }
                return "", initErr
        }
        if unmountErr != nil {
                return "", unmountErr
        }

        if err := snapshotter.Commit(ctx, initKey, workDir); err != nil {
                return "", err
        }

        return initKey, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package rootfs

import (
        "os"
        "path/filepath"
        "syscall"
)

const (
        // defaultInitializer is the key under which initFS is registered in
        // initializers.
        defaultInitializer = "linux-init"
)

// init registers initFS as the initializer for defaultInitializer.
func init() {
        initializers[defaultInitializer] = initFS
}

// createDirectory returns an initializer that makes sure name, relative to the
// root it is given, is a directory owned by uid:gid.
//
// An existing directory with matching ownership is left untouched. An existing
// non-directory is removed and replaced by a directory with mode 0755, and a
// missing path is created with mode 0755. In every other case the path is
// chowned to uid:gid at the end.
func createDirectory(name string, uid, gid int) initializerFunc {
        return func(root string) error {
                target := filepath.Join(root, name)

                fi, err := os.Stat(target)
                switch {
                case err == nil && fi.IsDir():
                        sys := fi.Sys().(*syscall.Stat_t)
                        if int(sys.Uid) == uid && int(sys.Gid) == gid {
                                return nil
                        }
                case err == nil:
                        // Something other than a directory is in the way.
                        if err := os.Remove(target); err != nil {
                                return err
                        }
                        if err := os.Mkdir(target, 0755); err != nil {
                                return err
                        }
                case os.IsNotExist(err):
                        if err := os.Mkdir(target, 0755); err != nil {
                                return err
                        }
                default:
                        return err
                }

                return os.Chown(target, uid, gid)
        }
}

// touchFile returns an initializer that makes sure name, relative to the root
// it is given, exists and is owned by uid:gid.
//
// An existing path is only chowned when its ownership differs. A missing path
// is created with mode 0644 and then chowned through the open file.
func touchFile(name string, uid, gid int) initializerFunc {
        return func(root string) error {
                target := filepath.Join(root, name)

                fi, err := os.Stat(target)
                if err == nil {
                        sys := fi.Sys().(*syscall.Stat_t)
                        if int(sys.Uid) == uid && int(sys.Gid) == gid {
                                return nil
                        }
                        return os.Chown(target, uid, gid)
                }
                if !os.IsNotExist(err) {
                        return err
                }

                created, err := os.OpenFile(target, os.O_CREATE, 0644)
                if err != nil {
                        return err
                }
                defer created.Close()

                return created.Chown(uid, gid)
        }
}

// symlink returns an initializer that creates a symbolic link at newname,
// relative to the root it is given, pointing at oldname.
//
// If anything already exists at the link path the initializer does nothing.
// The check uses Lstat so that it inspects the path itself rather than its
// target: an existing link whose target does not resolve from the host's point
// of view (for example a relative or dangling link inside the rootfs) counts
// as present instead of making the subsequent Symlink call fail with "file
// exists", and absolute link targets are not resolved against the host
// filesystem.
func symlink(oldname, newname string) initializerFunc {
        return func(root string) error {
                linkName := filepath.Join(root, newname)
                if _, err := os.Lstat(linkName); err == nil {
                        // Something is already there; leave it alone.
                        return nil
                } else if !os.IsNotExist(err) {
                        return err
                }
                return os.Symlink(oldname, linkName)
        }
}

// initFS populates root with a fixed set of directories, empty files and a
// symlink. Directories and files are owned by the same uid/gid as root itself.
// The steps run in order and the first failure is returned.
func initFS(root string) error {
        rootInfo, err := os.Stat(root)
        if err != nil {
                return err
        }
        owner := rootInfo.Sys().(*syscall.Stat_t)
        uid, gid := int(owner.Uid), int(owner.Gid)

        steps := []initializerFunc{
                createDirectory("/dev", uid, gid),
                createDirectory("/dev/pts", uid, gid),
                createDirectory("/dev/shm", uid, gid),
                touchFile("/dev/console", uid, gid),
                createDirectory("/proc", uid, gid),
                createDirectory("/sys", uid, gid),
                createDirectory("/etc", uid, gid),
                touchFile("/etc/resolv.conf", uid, gid),
                touchFile("/etc/hosts", uid, gid),
                touchFile("/etc/hostname", uid, gid),
                symlink("/proc/mounts", "/etc/mtab"),
        }

        for i := range steps {
                if err := steps[i](root); err != nil {
                        return err
                }
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package seccomp

// IsEnabled checks whether seccomp support is enabled. On Linux, it returns
// true if the kernel has been configured to support seccomp (kernel options
// CONFIG_SECCOMP and CONFIG_SECCOMP_FILTER are set). On non-Linux, it always
// returns false.
//
// The actual check is delegated to the package-private isEnabled.
func IsEnabled() bool {
        return isEnabled()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

/*
   Copyright The runc Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package seccomp

import (
        "sync"

        "golang.org/x/sys/unix"
)

var (
        // enabled holds the result of the seccomp probe performed by isEnabled.
        enabled     bool
        // enabledOnce is used by isEnabled so that the probe is executed only once.
        enabledOnce sync.Once
)

// isEnabled returns whether the kernel has been configured to support seccomp
// (including the check for CONFIG_SECCOMP_FILTER kernel option).
func isEnabled() bool {
        // Excerpts from prctl(2), section ERRORS:
        //
        // EACCES
        //      option is PR_SET_SECCOMP and arg2 is SECCOMP_MODE_FILTER, but
        //      the process does not have the CAP_SYS_ADMIN capability or has
        //      not set the no_new_privs attribute <...>.
        // <...>
        // EFAULT
        //      option is PR_SET_SECCOMP, arg2 is SECCOMP_MODE_FILTER, the
        //      system was built with CONFIG_SECCOMP_FILTER, and arg3 is an
        //      invalid address.
        // <...>
        // EINVAL
        //      option is PR_SET_SECCOMP or PR_GET_SECCOMP, and the kernel
        //      was not configured with CONFIG_SECCOMP.
        //
        // EINVAL
        //      option is PR_SET_SECCOMP, arg2 is SECCOMP_MODE_FILTER,
        //      and the kernel was not configured with CONFIG_SECCOMP_FILTER.
        // <end of quote>
        //
        // Meaning, in case these kernel options are set (this is what we check
        // for here), we will get some other error (most probably EACCES or
        // EFAULT). IOW, EINVAL means "seccomp not supported", any other error
        // means it is supported.

        enabledOnce.Do(func() {
                enabled = unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0) != unix.EINVAL
        })

        return enabled
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package shim

import (
        "context"
        "errors"
        "sync"
        "time"

        v1 "github.com/containerd/containerd/api/services/ttrpc/events/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/containerd/v2/pkg/ttrpcutil"
        "github.com/containerd/log"
        "github.com/containerd/ttrpc"
)

const (
        // queueSize is the buffer capacity of the publisher's requeue channel.
        queueSize  = 2048
        // maxRequeue is the retry threshold: processQueue drops an event once
        // its retry count exceeds this value.
        maxRequeue = 5
)

// item is an event waiting to be (re)forwarded by RemoteEventsPublisher.
type item struct {
        // ev is the envelope that is sent in the forward request.
        ev    *types.Envelope
        // ctx is the context the event was published with; it is reused for retries.
        ctx   context.Context
        // count is the number of times the item has been re-queued.
        count int
}

// NewPublisher creates a ttrpc client for address and wraps it in a
// RemoteEventsPublisher. It also starts the goroutine that retries events
// placed on the requeue channel. An error from client creation is returned
// unchanged.
func NewPublisher(address string) (*RemoteEventsPublisher, error) {
        conn, err := ttrpcutil.NewClient(address)
        if err != nil {
                return nil, err
        }

        pub := new(RemoteEventsPublisher)
        pub.client = conn
        pub.closed = make(chan struct{})
        pub.requeue = make(chan *item, queueSize)

        // Retry failed forwards in the background.
        go pub.processQueue()
        return pub, nil
}

// RemoteEventsPublisher forwards events to a ttrpc server
type RemoteEventsPublisher struct {
        // client is the ttrpc client used to reach the events service.
        client  *ttrpcutil.Client
        // closed is closed by Close and exposed through Done.
        closed  chan struct{}
        // closer ensures the closed channel is closed only once.
        closer  sync.Once
        // requeue carries events whose forwarding failed and should be retried.
        requeue chan *item
}

// Done returns a receive-only channel that is closed once Close has been
// called on the publisher.
func (l *RemoteEventsPublisher) Done() <-chan struct{} {
        return l.closed
}

// Close shuts down the underlying ttrpc client and then closes the channel
// returned by Done. The Done channel is closed at most once, so Close may be
// called repeatedly; the error of the client close is returned.
func (l *RemoteEventsPublisher) Close() (err error) {
        signalDone := func() { close(l.closed) }

        err = l.client.Close()
        l.closer.Do(signalDone)
        return err
}

// processQueue consumes the requeue channel and retries forwarding each
// event. Events retried more than maxRequeue times are logged and dropped;
// events that fail again are handed back to queue for another delayed attempt.
func (l *RemoteEventsPublisher) processQueue() {
        for pending := range l.requeue {
                if pending.count > maxRequeue {
                        // Give up on this event.
                        log.L.Errorf("evicting %s from queue because of retry count", pending.ev.Topic)
                        continue
                }

                req := &v1.ForwardRequest{Envelope: pending.ev}
                err := l.forwardRequest(pending.ctx, req)
                if err == nil {
                        continue
                }
                log.L.WithError(err).Error("forward event")
                l.queue(pending)
        }
}

// queue schedules i for another forwarding attempt. The work happens in a new
// goroutine: the retry count is incremented, the goroutine sleeps for that
// many seconds, and the item is then sent on the requeue channel.
func (l *RemoteEventsPublisher) queue(i *item) {
        go func() {
                i.count++
                // Back off linearly: one second per retry so far.
                backoff := time.Duration(i.count) * time.Second
                time.Sleep(backoff)
                l.requeue <- i
        }()
}

// Publish wraps event in an envelope (current timestamp, namespace taken from
// ctx, and topic) and forwards it to the configured ttrpc server.
//
// An error is returned when ctx carries no namespace, when the event cannot
// be marshaled, or when forwarding fails. In the last case the event is also
// queued for background retries.
func (l *RemoteEventsPublisher) Publish(ctx context.Context, topic string, event events.Event) error {
        namespace, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return err
        }
        payload, err := protobuf.MarshalAnyToProto(event)
        if err != nil {
                return err
        }

        envelope := &types.Envelope{
                Timestamp: protobuf.ToTimestamp(time.Now()),
                Namespace: namespace,
                Topic:     topic,
                Event:     payload,
        }
        pending := &item{ev: envelope, ctx: ctx}

        err = l.forwardRequest(pending.ctx, &v1.ForwardRequest{Envelope: pending.ev})
        if err == nil {
                return nil
        }

        // Report the failure to the caller, but keep retrying in the background.
        l.queue(pending)
        return err
}

// forwardRequest sends req to the events service of the ttrpc server.
//
// Each attempt runs under its own 5 second timeout derived from ctx. If the
// first attempt fails because the connection is closed (ttrpc.ErrClosed,
// including wrapped occurrences), the client is reconnected and the request
// is sent exactly once more. Any other error is returned as is.
func (l *RemoteEventsPublisher) forwardRequest(ctx context.Context, req *v1.ForwardRequest) error {
        // forward performs a single attempt. A fresh timeout context is created
        // per attempt so that a retry does not inherit an already expired deadline.
        forward := func() error {
                service, err := l.client.EventsService()
                if err != nil {
                        return err
                }
                fCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
                defer cancel()
                _, err = service.Forward(fCtx, req)
                return err
        }

        err := forward()
        // Success (nil) and every failure other than a closed connection are
        // final; errors.Is also matches a wrapped ttrpc.ErrClosed.
        if !errors.Is(err, ttrpc.ErrClosed) {
                return err
        }

        // Reconnect and retry request
        if err := l.client.Reconnect(); err != nil {
                return err
        }
        return forward()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package shim

import (
        "context"
        "encoding/json"
        "errors"
        "flag"
        "fmt"
        "io"
        "net"
        "os"
        "path/filepath"
        "runtime"
        "runtime/debug"
        "time"

        shimapi "github.com/containerd/containerd/api/runtime/task/v3"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/containerd/v2/pkg/protobuf/proto"
        "github.com/containerd/containerd/v2/pkg/shutdown"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/version"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "github.com/containerd/ttrpc"
        "github.com/sirupsen/logrus"
)

// Publisher is an events publisher that can also be closed: it combines
// events.Publisher with io.Closer.
type Publisher interface {
        events.Publisher
        io.Closer
}

// StartOpts describes shim start configuration received from containerd
type StartOpts struct {
        // Address is the value of the -address flag (grpc address back to main containerd).
        Address      string
        // TTRPCAddress is the value of the TTRPC_ADDRESS environment variable.
        TTRPCAddress string
        // Debug is the value of the -debug flag.
        Debug        bool
}

// BootstrapParams is a JSON payload returned in stdout from shim.Start call.
type BootstrapParams struct {
        // Version is the version of shim parameters (expected 2 for shim v2)
        Version int `json:"version"`
        // Address is the address containerd should use to connect to the shim.
        Address string `json:"address"`
        // Protocol is either TTRPC or GRPC.
        Protocol string `json:"protocol"`
}

// StopStatus is the result of Manager.Stop. For the "delete" action its
// fields are copied into the DeleteResponse written to stdout.
type StopStatus struct {
        // Pid is reported as DeleteResponse.Pid.
        Pid        int
        // ExitStatus is reported as DeleteResponse.ExitStatus.
        ExitStatus int
        // ExitedAt is reported as DeleteResponse.ExitedAt.
        ExitedAt   time.Time
}

// Manager is the interface which manages the shim process
type Manager interface {
        // Name is used as the "runtime" log field and as the prefix of the
        // error message printed by Run.
        Name() string
        // Start is invoked for the "start" action; the returned BootstrapParams
        // are written to stdout as JSON.
        Start(ctx context.Context, id string, opts StartOpts) (BootstrapParams, error)
        // Stop is invoked for the "delete" action; the returned StopStatus is
        // written to stdout as a marshaled DeleteResponse.
        Stop(ctx context.Context, id string) (StopStatus, error)
        // Info is invoked when the -info flag is set. optionsR is the process's
        // stdin, and the returned RuntimeInfo is marshaled to stdout.
        Info(ctx context.Context, optionsR io.Reader) (*types.RuntimeInfo, error)
}

// OptsKey is the context key for the Opts value. The shim stores an Opts
// under OptsKey{} in the context it passes on to the manager and plugins.
type OptsKey struct{}

// Opts are context options associated with the shim invocation.
type Opts struct {
        // BundlePath is the value of the -bundle flag.
        BundlePath string
        // Debug is the value of the -debug flag.
        Debug      bool
}

// BinaryOpts allows the configuration of a shims binary setup. Each option
// passed to Run is applied, in order, to a zero-valued Config.
type BinaryOpts func(*Config)

// Config of shim binary options provided by shim implementations
type Config struct {
        // NoSubreaper disables setting the shim as a child subreaper
        // (the subreaper call is skipped during startup).
        NoSubreaper bool
        // NoReaper disables the shim binary from reaping any child process implicitly
        // (SIGCHLD is not added to the handled signals).
        NoReaper bool
        // NoSetupLogger disables automatic configuration of logrus to use the shim FIFO
        NoSetupLogger bool
}

// TTRPCService is implemented by plugin instances that expose a ttrpc API.
// Every loaded plugin instance satisfying it is registered on the shim's
// ttrpc server before serving starts.
type TTRPCService interface {
        // RegisterTTRPC registers the service on the given server.
        RegisterTTRPC(*ttrpc.Server) error
}

// TTRPCServerOptioner is a TTRPCService that additionally contributes a unary
// server interceptor. The interceptors of all loaded plugins are chained and
// installed on the shim's ttrpc server.
type TTRPCServerOptioner interface {
        TTRPCService

        // UnaryInterceptor returns the interceptor to add to the chain.
        UnaryInterceptor() ttrpc.UnaryServerInterceptor
}

// Command-line state populated by parseFlags.
var (
        // debugFlag is set by -debug.
        debugFlag            bool
        // versionFlag is set by -v.
        versionFlag          bool
        // infoFlag is set by -info.
        infoFlag             bool
        // id is set by -id (id of the task).
        id                   string
        // namespaceFlag is set by -namespace.
        namespaceFlag        string
        // socketFlag is set by -socket.
        socketFlag           string
        // bundlePath is set by -bundle.
        bundlePath           string
        // addressFlag is set by -address.
        addressFlag          string
        // containerdBinaryFlag is set by -publish-binary.
        containerdBinaryFlag string
        // action is the first positional argument left after flag parsing.
        action               string
)

// Names of environment variables exchanged between containerd and the shim.
const (
        // ttrpcAddressEnv is read at shim startup to locate the ttrpc events endpoint.
        ttrpcAddressEnv = "TTRPC_ADDRESS"
        // grpcAddressEnv is set to the grpc address when the shim command is built.
        grpcAddressEnv  = "GRPC_ADDRESS"
        // namespaceEnv is set to the namespace when the shim command is built.
        namespaceEnv    = "NAMESPACE"
        // maxVersionEnv is set to "2" when the shim command is built.
        maxVersionEnv   = "MAX_SHIM_VERSION"
)

// parseFlags registers the shim's command-line flags on the default flag set,
// parses the process arguments and records the first positional argument as
// the requested action.
func parseFlags() {
        // Boolean switches.
        flag.BoolVar(&debugFlag, "debug", false, "enable debug output in logs")
        flag.BoolVar(&versionFlag, "v", false, "show the shim version and exit")
        // "info" is not a subcommand, because old shims produce very confusing errors for unknown subcommands
        // https://github.com/containerd/containerd/pull/8509#discussion_r1210021403
        flag.BoolVar(&infoFlag, "info", false, "get the option protobuf from stdin, print the shim info protobuf to stdout, and exit")

        // String options.
        flag.StringVar(&namespaceFlag, "namespace", "", "namespace that owns the shim")
        flag.StringVar(&id, "id", "", "id of the task")
        flag.StringVar(&socketFlag, "socket", "", "socket path to serve")
        flag.StringVar(&bundlePath, "bundle", "", "path to the bundle if not workdir")
        flag.StringVar(&addressFlag, "address", "", "grpc address back to main containerd")

        publishUsage := fmt.Sprintf(
                "path to publish binary (used for publishing events), but %s will ignore this flag, please use the %s env",
                os.Args[0], ttrpcAddressEnv,
        )
        flag.StringVar(&containerdBinaryFlag, "publish-binary", "", publishUsage)

        flag.Parse()
        action = flag.Arg(0)
}

// setRuntime tunes the Go runtime for the shim process: the GC target
// percentage is set to 40, a background goroutine returns memory to the OS
// every 30 seconds, and GOMAXPROCS is set to 2 unless the GOMAXPROCS
// environment variable is non-empty.
func setRuntime() {
        debug.SetGCPercent(40)

        go func() {
                ticks := time.Tick(30 * time.Second)
                for range ticks {
                        debug.FreeOSMemory()
                }
        }()

        if os.Getenv("GOMAXPROCS") != "" {
                // Respect the value chosen by the environment.
                return
        }
        // If GOMAXPROCS hasn't been set, we default to a value of 2 to reduce
        // the number of Go stacks present in the shim.
        runtime.GOMAXPROCS(2)
}

// setLogger configures the logger found in ctx: it installs a text formatter
// with full RFC3339Nano timestamps, raises the level to debug when -debug was
// given, and redirects output to the writer returned by openLog. The returned
// context carries the configured logger. If openLog fails, the original ctx
// is returned together with the error.
func setLogger(ctx context.Context, id string) (context.Context, error) {
        entry := log.G(ctx)

        formatter := &logrus.TextFormatter{
                TimestampFormat: log.RFC3339NanoFixed,
                FullTimestamp:   true,
        }
        entry.Logger.SetFormatter(formatter)
        if debugFlag {
                entry.Logger.SetLevel(log.DebugLevel)
        }

        out, err := openLog(ctx, id)
        if err != nil {
                return ctx, err
        }
        entry.Logger.SetOutput(out)

        return log.WithLogger(ctx, entry), nil
}

// Run initializes and runs a shim server for manager. The given options are
// applied to a fresh Config before startup. If the shim fails, the error is
// written to stderr prefixed with the manager name and the process exits
// with status 1.
func Run(ctx context.Context, manager Manager, opts ...BinaryOpts) {
        cfg := Config{}
        for i := range opts {
                opts[i](&cfg)
        }

        // Tag every log line with the runtime name.
        logger := log.G(ctx).WithField("runtime", manager.Name())
        ctx = log.WithLogger(ctx, logger)

        err := run(ctx, manager, cfg)
        if err == nil {
                return
        }
        fmt.Fprintf(os.Stderr, "%s: %s", manager.Name(), err)
        os.Exit(1)
}

// runInfo implements the -info flag: it asks the manager for its runtime
// info, handing it stdin as the options reader, and writes the marshaled
// protobuf result to stdout.
func runInfo(ctx context.Context, manager Manager) error {
        runtimeInfo, err := manager.Info(ctx, os.Stdin)
        if err != nil {
                return err
        }

        encoded, err := proto.Marshal(runtimeInfo)
        if err != nil {
                return err
        }

        if _, err := os.Stdout.Write(encoded); err != nil {
                return err
        }
        return nil
}

// run is the body of the shim binary.
//
// After parsing flags it handles the -v and -info modes, which return
// immediately. Otherwise it requires a namespace, tunes the runtime, installs
// signal handling, optionally becomes a subreaper, and creates the events
// publisher. The "delete" and "start" actions are then delegated to the
// manager and their result is written to stdout. For any other action the
// registered plugins are initialized, their ttrpc services are registered on
// a new server, and the server is served until shutdown.
func run(ctx context.Context, manager Manager, config Config) error {
        parseFlags()
        if versionFlag {
                fmt.Printf("%s:\n", filepath.Base(os.Args[0]))
                fmt.Println("  Version: ", version.Version)
                fmt.Println("  Revision:", version.Revision)
                fmt.Println("  Go version:", version.GoVersion)
                fmt.Println("")
                return nil
        }

        if infoFlag {
                return runInfo(ctx, manager)
        }

        if namespaceFlag == "" {
                return fmt.Errorf("shim namespace cannot be empty")
        }

        setRuntime()

        signals, err := setupSignals(config)
        if err != nil {
                return err
        }

        if !config.NoSubreaper {
                if err := subreaper(); err != nil {
                        return err
                }
        }

        // The publisher forwards events to the address given in TTRPC_ADDRESS.
        ttrpcAddress := os.Getenv(ttrpcAddressEnv)
        publisher, err := NewPublisher(ttrpcAddress)
        if err != nil {
                return err
        }
        defer publisher.Close()

        // Make the namespace, the shim options and a shutdown handle available
        // to the manager and to plugins through the context.
        ctx = namespaces.WithNamespace(ctx, namespaceFlag)
        ctx = context.WithValue(ctx, OptsKey{}, Opts{BundlePath: bundlePath, Debug: debugFlag})
        ctx, sd := shutdown.WithShutdown(ctx)
        defer sd.Shutdown()

        // Handle explicit actions
        switch action {
        case "delete":
                logger := log.G(ctx).WithFields(log.Fields{
                        "pid":       os.Getpid(),
                        "namespace": namespaceFlag,
                })
                if debugFlag {
                        logger.Logger.SetLevel(log.DebugLevel)
                }
                // Run the signal loop in the background while the manager stops the shim.
                go reap(ctx, logger, signals)
                ss, err := manager.Stop(ctx, id)
                if err != nil {
                        return err
                }
                // The stop status is reported on stdout as a marshaled DeleteResponse.
                data, err := proto.Marshal(&shimapi.DeleteResponse{
                        Pid:        uint32(ss.Pid),
                        ExitStatus: uint32(ss.ExitStatus),
                        ExitedAt:   protobuf.ToTimestamp(ss.ExitedAt),
                })
                if err != nil {
                        return err
                }
                if _, err := os.Stdout.Write(data); err != nil {
                        return err
                }
                return nil
        case "start":
                opts := StartOpts{
                        Address:      addressFlag,
                        TTRPCAddress: ttrpcAddress,
                        Debug:        debugFlag,
                }

                params, err := manager.Start(ctx, id, opts)
                if err != nil {
                        return err
                }

                // The bootstrap parameters are reported on stdout as JSON.
                data, err := json.Marshal(&params)
                if err != nil {
                        return fmt.Errorf("failed to marshal bootstrap params to json: %w", err)
                }

                if _, err := os.Stdout.Write(data); err != nil {
                        return err
                }

                return nil
        }

        // No explicit action: run as the long-lived shim server.
        if !config.NoSetupLogger {
                ctx, err = setLogger(ctx, id)
                if err != nil {
                        return err
                }
        }

        // Expose the shutdown service to other plugins.
        registry.Register(&plugin.Registration{
                Type: plugins.InternalPlugin,
                ID:   "shutdown",
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        return sd, nil
                },
        })

        // Register event plugin
        registry.Register(&plugin.Registration{
                Type: plugins.EventPlugin,
                ID:   "publisher",
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        return publisher, nil
                },
        })

        var (
                initialized   = plugin.NewPluginSet()
                ttrpcServices = []TTRPCService{}

                ttrpcUnaryInterceptors = []ttrpc.UnaryServerInterceptor{}
        )

        // NOTE(review): the filter passed to Graph returns false for every
        // registration; presumably this means no plugin is filtered out — confirm
        // against the registry package.
        for _, p := range registry.Graph(func(*plugin.Registration) bool { return false }) {
                pID := p.URI()
                log.G(ctx).WithFields(log.Fields{"id": pID, "type": p.Type}).Debug("loading plugin")

                initContext := plugin.NewContext(
                        ctx,
                        initialized,
                        map[string]string{
                                // NOTE: Root is empty since the shim does not support persistent storage,
                                // shim plugins should make use of the state directory for writing files to disk.
                                // The state directory will be destroyed when the shim is cleaned up or
                                // on reboot
                                plugins.PropertyStateDir:     filepath.Join(bundlePath, p.URI()),
                                plugins.PropertyGRPCAddress:  addressFlag,
                                plugins.PropertyTTRPCAddress: ttrpcAddress,
                        },
                )

                // load the plugin specific configuration if it is provided
                // TODO: Read configuration passed into shim, or from state directory?
                // if p.Config != nil {
                //        pc, err := config.Decode(p)
                //        if err != nil {
                //                return nil, err
                //        }
                //        initContext.Config = pc
                // }

                result := p.Init(initContext)
                if err := initialized.Add(result); err != nil {
                        return fmt.Errorf("could not add plugin result to plugin set: %w", err)
                }

                instance, err := result.Instance()
                if err != nil {
                        // A plugin may opt out of loading; that is logged and is not an error.
                        if plugin.IsSkipPlugin(err) {
                                log.G(ctx).WithFields(log.Fields{"id": pID, "type": p.Type, "error": err}).Info("skip loading plugin")
                                continue
                        }
                        return fmt.Errorf("failed to load plugin %s: %w", pID, err)
                }

                if src, ok := instance.(TTRPCService); ok {
                        log.G(ctx).WithField("id", pID).Debug("registering ttrpc service")
                        ttrpcServices = append(ttrpcServices, src)

                }

                if src, ok := instance.(TTRPCServerOptioner); ok {
                        ttrpcUnaryInterceptors = append(ttrpcUnaryInterceptors, src.UnaryInterceptor())
                }
        }

        // At least one plugin must provide a ttrpc service to serve.
        if len(ttrpcServices) == 0 {
                return fmt.Errorf("required that ttrpc service")
        }

        unaryInterceptor := chainUnaryServerInterceptors(ttrpcUnaryInterceptors...)
        server, err := newServer(ttrpc.WithUnaryServerInterceptor(unaryInterceptor))
        if err != nil {
                return fmt.Errorf("failed creating server: %w", err)
        }

        for _, srv := range ttrpcServices {
                if err := srv.RegisterTTRPC(server); err != nil {
                        return fmt.Errorf("failed to register service: %w", err)
                }
        }

        // serve blocks; an error wrapping shutdown.ErrShutdown is not treated
        // as a failure.
        if err := serve(ctx, server, signals, sd.Shutdown); err != nil {
                if !errors.Is(err, shutdown.ErrShutdown) {
                        cleanupSockets(ctx)
                        return err
                }
        }

        // NOTE: If the shim server is down(like oom killer), the address
        // socket might be leaking.
        cleanupSockets(ctx)

        // Wait up to 5 seconds for the shutdown service to report completion.
        select {
        case <-sd.Done():
                return nil
        case <-time.After(5 * time.Second):
                return errors.New("shim shutdown timeout")
        }
}

// serve starts the ttrpc server on the listener selected by the -socket flag
// and then runs the signal loop, blocking until that loop returns (i.e. until
// ctx is done).
//
// Goroutines are started for the server itself, for stack dumps requested
// through the dump-stacks signal, and for exit-signal handling, which invokes
// shutdown. A server failure other than net.ErrClosed is logged as fatal.
func serve(ctx context.Context, server *ttrpc.Server, signals chan os.Signal, shutdown func()) error {
        dumpCh := make(chan os.Signal, 32)
        setupDumpStacks(dumpCh)

        cwd, err := os.Getwd()
        if err != nil {
                return err
        }

        listener, err := serveListener(socketFlag)
        if err != nil {
                return err
        }

        go func() {
                defer listener.Close()
                serveErr := server.Serve(ctx, listener)
                if serveErr == nil || errors.Is(serveErr, net.ErrClosed) {
                        return
                }
                log.G(ctx).WithError(serveErr).Fatal("containerd-shim: ttrpc server failure")
        }()

        fields := log.Fields{
                "pid":       os.Getpid(),
                "path":      cwd,
                "namespace": namespaceFlag,
        }
        logger := log.G(ctx).WithFields(fields)

        // Dump all goroutine stacks every time the dump signal arrives.
        go func() {
                for range dumpCh {
                        dumpStacks(logger)
                }
        }()

        go handleExitSignals(ctx, logger, shutdown)
        return reap(ctx, logger, signals)
}

// dumpStacks logs the stack traces of all goroutines at info level. The
// buffer starts at 16 KiB and is doubled until the trace no longer fills it
// completely.
func dumpStacks(logger *log.Entry) {
        for size := 16384; ; size *= 2 {
                buf := make([]byte, size)
                written := runtime.Stack(buf, true)
                if written == len(buf) {
                        // The buffer was filled to the brim; retry with a larger one.
                        continue
                }
                logger.Infof("=== BEGIN goroutine stack dump ===\n%s\n=== END goroutine stack dump ===", buf[:written])
                return
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package shim

import (
        "github.com/containerd/containerd/v2/pkg/sys/reaper"
        "github.com/containerd/ttrpc"
)

// newServer creates a ttrpc server from opts, appending a unix-socket
// handshaker built from ttrpc.UnixSocketRequireSameUser.
func newServer(opts ...ttrpc.ServerOpt) (*ttrpc.Server, error) {
        sameUser := ttrpc.WithServerHandshaker(ttrpc.UnixSocketRequireSameUser())
        return ttrpc.NewServer(append(opts, sameUser)...)
}

// subreaper asks the reaper package to mark this process as a child
// subreaper by calling reaper.SetSubreaper(1).
// NOTE(review): the argument 1 presumably means "enable" — confirm against
// the reaper package.
func subreaper() error {
        return reaper.SetSubreaper(1)
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package shim

import (
        "context"
        "fmt"
        "io"
        "net"
        "os"
        "os/signal"
        "syscall"

        "github.com/containerd/containerd/v2/pkg/sys/reaper"
        "github.com/containerd/fifo"
        "github.com/containerd/log"
        "github.com/sirupsen/logrus"
        "golang.org/x/sys/unix"
)

// setupSignals returns a buffered channel subscribed to SIGTERM, SIGINT and
// SIGPIPE. Unless config.NoReaper is set, SIGCHLD is delivered to the channel
// as well so that child exits can be reaped. The returned error is always nil.
func setupSignals(config Config) (chan os.Signal, error) {
        handled := []os.Signal{unix.SIGTERM, unix.SIGINT, unix.SIGPIPE}
        if reaping := !config.NoReaper; reaping {
                handled = append(handled, unix.SIGCHLD)
        }

        ch := make(chan os.Signal, 32)
        signal.Notify(ch, handled...)
        return ch, nil
}

// setupDumpStacks subscribes dump to SIGUSR1, the signal used to request a
// goroutine stack dump.
func setupDumpStacks(dump chan<- os.Signal) {
        const dumpSignal = syscall.SIGUSR1
        signal.Notify(dump, dumpSignal)
}

// serveListener returns the listener the shim API is served on. With an empty
// path the listener is built from file descriptor 3; otherwise a unix socket
// is created at path, which must not be longer than socketPathLimit.
func serveListener(path string) (net.Listener, error) {
        var (
                listener net.Listener
                err      error
        )

        switch {
        case path == "":
                listener, err = net.FileListener(os.NewFile(3, "socket"))
                // Only used for the log line below.
                path = "[inherited from parent]"
        case len(path) > socketPathLimit:
                return nil, fmt.Errorf("%q: unix socket path too long (> %d)", path, socketPathLimit)
        default:
                listener, err = net.Listen("unix", path)
        }
        if err != nil {
                return nil, err
        }

        log.L.WithField("socket", path).Debug("serving api on socket")
        return listener, nil
}

// reap runs the signal loop until ctx is done and then returns ctx.Err().
// On SIGCHLD it reaps exited children, logging any reap error; every other
// signal received on the channel is discarded.
func reap(ctx context.Context, logger *logrus.Entry, signals chan os.Signal) error {
        logger.Debug("starting signal loop")

        for {
                select {
                case <-ctx.Done():
                        return ctx.Err()
                case sig := <-signals:
                        // Exit signals are handled separately from this loop
                        // They get registered with this channel so that we can ignore such signals for short-running actions (e.g. `delete`)
                        if sig != unix.SIGCHLD {
                                continue
                        }
                        if err := reaper.Reap(); err != nil {
                                logger.WithError(err).Error("reap exit status")
                        }
                }
        }
}

// handleExitSignals waits for the first SIGINT or SIGTERM and calls cancel
// when one arrives. It returns without calling cancel if ctx is done first.
func handleExitSignals(ctx context.Context, logger *logrus.Entry, cancel context.CancelFunc) {
        exitCh := make(chan os.Signal, 32)
        signal.Notify(exitCh, syscall.SIGINT, syscall.SIGTERM)

        // Both outcomes end the function, so a single select is sufficient.
        select {
        case <-ctx.Done():
        case sig := <-exitCh:
                logger.WithField("signal", sig).Debugf("Caught exit signal")
                cancel()
        }
}

// openLog opens the fifo named "log" (a relative path) for writing via
// fifo.OpenFifoDup2, passing the file descriptor of stderr. The second
// parameter is unused in this implementation.
// NOTE(review): OpenFifoDup2 presumably duplicates the fifo onto the given
// descriptor, redirecting stderr into the fifo — confirm in the fifo package.
func openLog(ctx context.Context, _ string) (io.Writer, error) {
        return fifo.OpenFifoDup2(ctx, "log", unix.O_WRONLY, 0700, int(os.Stderr.Fd()))
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package shim

import (
        "bytes"
        "context"
        "errors"
        "fmt"
        "io"
        "net"
        "os"
        "os/exec"
        "path/filepath"
        "strings"
        "time"

        "github.com/containerd/ttrpc"
        "github.com/containerd/typeurl/v2"

        "github.com/containerd/containerd/v2/pkg/atomicfile"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/protobuf/proto"
        "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/errdefs"
)

// CommandConfig holds the inputs used by Command to build the shim process.
type CommandConfig struct {
        // Runtime is the executable that is started.
        Runtime      string
        // Address is passed as the -address flag and in the GRPC_ADDRESS env variable.
        Address      string
        // TTRPCAddress is passed in the TTRPC_ADDRESS env variable.
        TTRPCAddress string
        // Path is the working directory of the command.
        Path         string
        // SchedCore adds SCHED_CORE=1 to the command's environment when true.
        SchedCore    bool
        // Args are appended after the flags generated by Command.
        Args         []string
        // Opts, when non-nil, is marshaled and provided to the command on stdin.
        Opts         *types.Any
}

// Command builds the exec.Cmd used to start the shim binary config.Runtime.
//
// The namespace is taken from ctx (an error is returned if it is missing) and
// passed along with the address and the path of the current executable as
// flags, followed by config.Args. The environment is the current one extended
// with GOMAXPROCS, the maximum shim version, the ttrpc/grpc addresses and the
// namespace. If config.Opts is set, its marshaled form becomes the command's
// stdin.
func Command(ctx context.Context, config *CommandConfig) (*exec.Cmd, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return nil, err
        }
        self, err := os.Executable()
        if err != nil {
                return nil, err
        }

        argv := append([]string{
                "-namespace", ns,
                "-address", config.Address,
                "-publish-binary", self,
        }, config.Args...)

        cmd := exec.CommandContext(ctx, config.Runtime, argv...)
        cmd.Dir = config.Path

        env := append(
                os.Environ(),
                "GOMAXPROCS=2",
                maxVersionEnv+"=2",
                ttrpcAddressEnv+"="+config.TTRPCAddress,
                grpcAddressEnv+"="+config.Address,
                namespaceEnv+"="+ns,
        )
        if config.SchedCore {
                env = append(env, "SCHED_CORE=1")
        }
        cmd.Env = env
        cmd.SysProcAttr = getSysProcAttr()

        if config.Opts == nil {
                return cmd, nil
        }

        // Runtime options travel to the shim through stdin.
        optsBytes, err := proto.Marshal(config.Opts)
        if err != nil {
                return nil, err
        }
        cmd.Stdin = bytes.NewReader(optsBytes)
        return cmd, nil
}

// BinaryName derives the shim binary name from a runtime name of the form
// $prefix.name.version. The last two dot-separated components are inserted
// into shimBinaryFormat. An empty string is returned when the runtime name is
// invalid, i.e. it has fewer than two components or an empty first component.
func BinaryName(runtime string) string {
        segments := strings.Split(runtime, ".")
        n := len(segments)
        if n < 2 || segments[0] == "" {
                return ""
        }

        name, version := segments[n-2], segments[n-1]
        return fmt.Sprintf(shimBinaryFormat, name, version)
}

// BinaryPath returns the absolute path of the shim binary for the runtime
// name: the binary name from BinaryName joined to the directory component of
// runtime. An empty string is returned when the runtime name is invalid
// (BinaryName returns "") or when the absolute path cannot be determined.
func BinaryPath(runtime string) string {
        binary := BinaryName(runtime)
        if binary == "" {
                // Without this guard an invalid name would yield the absolute
                // directory instead of the documented empty string.
                return ""
        }

        path, err := filepath.Abs(filepath.Join(filepath.Dir(runtime), binary))
        if err != nil {
                return ""
        }

        return path
}

// Connect dials address using the supplied dialer d with a timeout of 100
// seconds and returns whatever the dialer returns.
func Connect(address string, d func(string, time.Duration) (net.Conn, error)) (net.Conn, error) {
        const dialTimeout = 100 * time.Second
        return d(address, dialTimeout)
}

// WritePidFile stores pid, formatted as a decimal number, in the file at path
// using the atomicfile package. The path is made absolute first. If writing
// fails the pending file is cancelled and the write error is returned.
func WritePidFile(path string, pid int) error {
        abs, err := filepath.Abs(path)
        if err != nil {
                return err
        }

        pidFile, err := atomicfile.New(abs, 0o644)
        if err != nil {
                return err
        }

        if _, err := fmt.Fprintf(pidFile, "%d", pid); err != nil {
                pidFile.Cancel()
                return err
        }
        return pidFile.Close()
}

// ErrNoAddress is returned when the address file has no content
var ErrNoAddress = errors.New("no shim address")

// ReadAddress reads the shim's socket address from the file at path and
// returns its content verbatim. The path is made absolute before reading.
// ErrNoAddress is returned when the file exists but is empty.
func ReadAddress(path string) (string, error) {
        abs, err := filepath.Abs(path)
        if err != nil {
                return "", err
        }

        content, err := os.ReadFile(abs)
        switch {
        case err != nil:
                return "", err
        case len(content) == 0:
                return "", ErrNoAddress
        }
        return string(content), nil
}

// ReadRuntimeOptions reads all bytes from reader, decodes them as a protobuf
// Any and unmarshals that through typeurl into a value of type T.
// The type must be registered with typeurl.
//
// It returns ErrNotFound when the reader yields no bytes, and an error
// wrapping ErrInvalidArgument when the decoded value is not of type T.
// Read, protobuf and typeurl failures are returned as errors as well; in all
// error cases the returned value is the zero value of T.
func ReadRuntimeOptions[T any](reader io.Reader) (T, error) {
        var config T

        raw, err := io.ReadAll(reader)
        switch {
        case err != nil:
                return config, fmt.Errorf("failed to read config bytes from stdin: %w", err)
        case len(raw) == 0:
                return config, errdefs.ErrNotFound
        }

        var envelope types.Any
        if err := proto.Unmarshal(raw, &envelope); err != nil {
                return config, err
        }

        decoded, err := typeurl.UnmarshalAny(&envelope)
        if err != nil {
                return config, err
        }

        typed, ok := decoded.(T)
        if !ok {
                return typed, fmt.Errorf("invalid type %T: %w", decoded, errdefs.ErrInvalidArgument)
        }
        return typed, nil
}

// chainUnaryServerInterceptors folds a list of ttrpc server interceptors into
// a single interceptor. When invoked, the first interceptor runs outermost and
// each one receives, as its method, a function that invokes the next
// interceptor; the last one receives the real method.
//
// With no interceptors it returns nil.
func chainUnaryServerInterceptors(interceptors ...ttrpc.UnaryServerInterceptor) ttrpc.UnaryServerInterceptor {
        // force to use default interceptor in ttrpc
        if len(interceptors) == 0 {
                return nil
        }

        return func(ctx context.Context, unmarshal ttrpc.Unmarshaler, info *ttrpc.UnaryServerInfo, method ttrpc.Method) (interface{}, error) {
                head, tail := interceptors[0], interceptors[1:]

                // Wrap from the innermost interceptor outwards so that the
                // call order ends up first-to-last.
                next := method
                for idx := len(tail) - 1; idx >= 0; idx-- {
                        wrapped, inner := tail[idx], next

                        next = func(innerCtx context.Context, innerUnmarshal func(interface{}) error) (interface{}, error) {
                                return wrapped(innerCtx, innerUnmarshal, info, inner)
                        }
                }
                return head(ctx, unmarshal, info, next)
        }
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package shim

import (
        "bufio"
        "context"
        "crypto/sha256"
        "errors"
        "fmt"
        "io"
        "math"
        "net"
        "os"
        "path/filepath"
        "runtime"
        "strconv"
        "strings"
        "syscall"
        "time"

        "github.com/containerd/log"
        "github.com/mdlayher/vsock"

        "github.com/containerd/containerd/v2/defaults"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/pkg/sys"
)

const (
        // shimBinaryFormat is the fmt pattern for shim binary names; it takes
        // two string arguments.
        shimBinaryFormat = "containerd-shim-%s-%s"
        // socketPathLimit is not referenced in the visible code.
        // NOTE(review): presumably the maximum length of a unix socket path — confirm.
        socketPathLimit  = 106
        // protoVsock, protoHybridVsock and protoUnix are the address schemes
        // (the part before "://") that AnonDialer dispatches on.
        protoVsock       = "vsock"
        protoHybridVsock = "hvsock"
        protoUnix        = "unix"
)

// getSysProcAttr returns the process attributes applied to the shim command:
// a fresh SysProcAttr with only Setpgid enabled.
func getSysProcAttr() *syscall.SysProcAttr {
        attr := new(syscall.SysProcAttr)
        attr.Setpgid = true
        return attr
}

// AdjustOOMScore sets the OOM score adjustment of the process pid to the
// score of this process's parent plus one, so that the parent ends up with a
// lower score than the shim. The actual write goes through
// sys.AdjustOOMScore.
func AdjustOOMScore(pid int) error {
        parentScore, err := sys.GetOOMScoreAdj(os.Getppid())
        if err != nil {
                return fmt.Errorf("get parent OOM score: %w", err)
        }

        if err := sys.AdjustOOMScore(pid, parentScore+1); err != nil {
                return fmt.Errorf("set shim OOM score: %w", err)
        }
        return nil
}

// socketRoot is the directory under which shim sockets are placed.
const socketRoot = defaults.DefaultStateDir

// SocketAddress returns a "unix://" socket address located in the "s"
// directory below socketRoot. The file name is the hex-encoded SHA-256 of
// socketPath, the namespace from ctx and id joined as a path. An error is
// returned when ctx carries no namespace.
func SocketAddress(ctx context.Context, socketPath, id string) (string, error) {
        ns, err := namespaces.NamespaceRequired(ctx)
        if err != nil {
                return "", err
        }

        sum := sha256.Sum256([]byte(filepath.Join(socketPath, ns, id)))
        dir := filepath.Join(socketRoot, "s")
        return fmt.Sprintf("unix://%s/%x", dir, sum), nil
}

// AnonDialer returns a dialer for a socket
func AnonDialer(address string, timeout time.Duration) (net.Conn, error) {
        proto, addr, ok := strings.Cut(address, "://")
        if !ok {
                return net.DialTimeout("unix", socket(address).path(), timeout)
        }
        switch proto {
        case protoVsock:
                // vsock dialer can not set timeout
                return dialVsock(addr)
        case protoHybridVsock:
                return dialHybridVsock(addr, timeout)
        case protoUnix:
                return net.DialTimeout("unix", socket(address).path(), timeout)
        default:
                return nil, fmt.Errorf("unsupported protocol: %s", proto)
        }
}

// AnonReconnectDialer dials an existing socket on reconnection. In this
// implementation it simply delegates to AnonDialer with the same address and
// timeout.
func AnonReconnectDialer(address string, timeout time.Duration) (net.Conn, error) {
        return AnonDialer(address, timeout)
}

// NewSocket creates and returns a unix listener for address.
//
// For non-abstract sockets the parent directory is created first and the
// socket file is chmod-ed after listening; if the chmod fails the socket file
// is removed, the listener is closed and an error is returned. Abstract
// sockets skip both filesystem steps.
func NewSocket(address string) (*net.UnixListener, error) {
        sock := socket(address)
        path := sock.path()
        onDisk := !sock.isAbstract()

        perm := os.FileMode(0600)
        // Darwin needs +x to access socket, otherwise it'll fail with "bind: permission denied" when running as non-root.
        if runtime.GOOS == "darwin" {
                perm = 0700
        }

        if onDisk {
                if err := os.MkdirAll(filepath.Dir(path), perm); err != nil {
                        return nil, fmt.Errorf("mkdir failed for %s: %w", path, err)
                }
        }

        l, err := net.Listen("unix", path)
        if err != nil {
                return nil, err
        }

        if onDisk {
                if err := os.Chmod(path, perm); err != nil {
                        // Best-effort cleanup; the chmod error is what gets reported.
                        os.Remove(path)
                        l.Close()
                        return nil, fmt.Errorf("chmod failed for %s: %w", path, err)
                }
        }

        return l.(*net.UnixListener), nil
}

// abstractSocketPrefix is prepended to the path of abstract sockets.
const abstractSocketPrefix = "\x00"

// socket is a shim socket address, either "unix://<path>" or a bare name that
// is treated as an abstract socket.
type socket string

// isAbstract reports whether the address lacks the "unix://" prefix.
func (s socket) isAbstract() bool {
        if strings.HasPrefix(string(s), "unix://") {
                return false
        }
        return true
}

// path returns the path to listen on or dial: the address without its
// "unix://" prefix, or, when there is no such prefix, the whole address
// prefixed with abstractSocketPrefix.
func (s socket) path() string {
        if s.isAbstract() {
                return abstractSocketPrefix + string(s)
        }
        return strings.TrimPrefix(string(s), "unix://")
}

// RemoveSocket removes the socket file for the specified address. Abstract
// sockets have no file on the filesystem, so for them this is a no-op that
// returns nil; otherwise the result of os.Remove is returned.
func RemoveSocket(address string) error {
        sock := socket(address)
        if sock.isAbstract() {
                return nil
        }
        return os.Remove(sock.path())
}

// SocketEaddrinuse reports whether err is a *net.OpError from a "listen"
// operation whose cause is the EADDRINUSE error number.
func SocketEaddrinuse(err error) bool {
        var opErr *net.OpError
        if !errors.As(err, &opErr) {
                return false
        }
        return opErr.Op == "listen" && errors.Is(err, syscall.EADDRINUSE)
}

// CanConnect reports whether the socket at address accepts a new connection.
// It dials with a 100ms timeout and immediately closes the connection on
// success.
func CanConnect(address string) bool {
        conn, err := AnonDialer(address, 100*time.Millisecond)
        if err == nil {
                conn.Close()
                return true
        }
        return false
}

// hybridVsockDialer connects to the unix socket at addr and performs the
// hybrid vsock handshake: it writes "CONNECT <port>\n" and waits for a
// response line containing "OK".
//
// If the handshake read ends with EOF the server is assumed not to be ready
// yet; the connection is closed and the whole dial+handshake is retried after
// timeout/10. The overall attempt is bounded by timeout: once it elapses,
// whether during a handshake or while waiting to retry, a timeout error is
// returned. Dial errors, write errors and any non-EOF handshake error are
// returned immediately without retrying.
//
// NOTE(review): the handshake line is read through a bufio.Reader, so any
// bytes the peer sends right after the response line may be consumed by that
// reader and never reach the returned connection — confirm the peer stays
// silent until the client speaks.
func hybridVsockDialer(addr string, port uint64, timeout time.Duration) (net.Conn, error) {
        timeoutCh := time.After(timeout)
        // Do 10 retries before timeout
        retryInterval := timeout / 10
        for {
                conn, err := net.DialTimeout("unix", addr, timeout)
                if err != nil {
                        return nil, err
                }
                if _, err = fmt.Fprintf(conn, "CONNECT %d\n", port); err != nil {
                        conn.Close()
                        return nil, err
                }
                // Buffered so the reader goroutine can always deliver its
                // result and exit, even after the select below has given up.
                errChan := make(chan error, 1)
                go func() {
                        reader := bufio.NewReader(conn)
                        response, err := reader.ReadString('\n')
                        if err != nil {
                                errChan <- err
                                return
                        }
                        if strings.Contains(response, "OK") {
                                errChan <- nil
                        } else {
                                errChan <- fmt.Errorf("hybrid vsock handshake response error: %s", response)
                        }
                }()
                select {
                case err = <-errChan:
                        if err == nil {
                                return conn, nil
                        }
                        conn.Close()
                        if !errors.Is(err, io.EOF) {
                                return nil, err
                        }
                        // When it is EOF, maybe the server side is not ready.
                        log.G(context.Background()).Warnf("Read hybrid vsock got EOF, server may not ready")
                        // Back off before retrying, but never past the overall
                        // deadline.
                        select {
                        case <-time.After(retryInterval):
                        case <-timeoutCh:
                                return nil, fmt.Errorf("timeout waiting for hybrid vsocket handshake of %s:%d", addr, port)
                        }
                case <-timeoutCh:
                        // Closing the connection also unblocks the reader goroutine.
                        conn.Close()
                        return nil, fmt.Errorf("timeout waiting for hybrid vsocket handshake of %s:%d", addr, port)
                }
        }
}

// dialVsock dials a vsock address of the form "<contextID>:<port>", where
// both parts are decimal numbers that must fit in 32 bits.
//
// Parse failures are wrapped with %w so callers can inspect the underlying
// strconv error with errors.Is / errors.As.
func dialVsock(address string) (net.Conn, error) {
        contextIDString, portString, ok := strings.Cut(address, ":")
        if !ok {
                return nil, fmt.Errorf("invalid vsock address %s", address)
        }
        contextID, err := strconv.ParseUint(contextIDString, 10, 0)
        if err != nil {
                return nil, fmt.Errorf("failed to parse vsock context id %s, %w", contextIDString, err)
        }
        if contextID > math.MaxUint32 {
                return nil, fmt.Errorf("vsock context id %d is invalid", contextID)
        }
        port, err := strconv.ParseUint(portString, 10, 0)
        if err != nil {
                return nil, fmt.Errorf("failed to parse vsock port %s, %w", portString, err)
        }
        if port > math.MaxUint32 {
                return nil, fmt.Errorf("vsock port %d is invalid", port)
        }
        return vsock.Dial(uint32(contextID), uint32(port), &vsock.Config{})
}

// dialHybridVsock dials a hybrid vsock address of the form "<addr>:<port>",
// split at the first ':'. The port must be a decimal number that fits in 32
// bits; addr is passed to hybridVsockDialer as the unix socket to connect to.
//
// A port parse failure is wrapped with %w so callers can inspect the
// underlying strconv error with errors.Is / errors.As.
func dialHybridVsock(address string, timeout time.Duration) (net.Conn, error) {
        addr, portString, ok := strings.Cut(address, ":")
        if !ok {
                return nil, fmt.Errorf("invalid hybrid vsock address %s", address)
        }
        port, err := strconv.ParseUint(portString, 10, 0)
        if err != nil {
                return nil, fmt.Errorf("failed to parse hybrid vsock port %s, %w", portString, err)
        }
        if port > math.MaxUint32 {
                return nil, fmt.Errorf("hybrid vsock port %d is invalid", port)
        }
        return hybridVsockDialer(addr, port, timeout)
}

// cleanupSockets removes the shim's sockets on a best-effort basis; every
// removal error is deliberately ignored. It removes the socket recorded in
// the "address" file (if readable), and then either the socket named by
// socketFlag or, when that flag is empty, the one derived via SocketAddress
// from addressFlag and id.
func cleanupSockets(ctx context.Context) {
        if recorded, err := ReadAddress("address"); err == nil {
                _ = RemoveSocket(recorded)
        }

        if socketFlag != "" {
                _ = RemoveSocket("unix://" + socketFlag)
                return
        }

        derived, err := SocketAddress(ctx, addressFlag, id)
        if err != nil {
                return
        }
        _ = RemoveSocket(derived)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package shutdown

import (
        "context"
        "errors"
        "sync"
        "time"

        "golang.org/x/sync/errgroup"
)

// ErrShutdown is the error reported once shutdown has completed without any
// callback returning an error.
var ErrShutdown = errors.New("shutdown")

// Service is used to facilitate shutdown through callback registration and
// shutdown initiation.
type Service interface {
        // Shutdown initiates shutdown
        Shutdown()
        // RegisterCallback registers functions to be called on shutdown and before
        // the shutdown channel is closed. A callback error will propagate to the
        // context error
        RegisterCallback(func(context.Context) error)
        // Done returns a channel that's closed when all shutdown callbacks are invoked.
        Done() <-chan struct{}
        // Err returns nil if Done is not yet closed.
        // If Done is closed, Err returns first failed callback error or ErrShutdown.
        Err() error
}

// WithShutdown returns a context which is similar to a cancel context, but
// with callbacks which can propagate to the context error. Unlike a cancel
// context, the shutdown context cannot be canceled from the parent context.
// However, future child contexts will be canceled upon shutdown.
//
// The returned context and Service are the same underlying value. Shutdown
// callbacks are given 30 seconds to complete.
func WithShutdown(ctx context.Context) (context.Context, Service) {
        svc := new(shutdownService)
        svc.Context = ctx
        svc.doneC = make(chan struct{})
        svc.timeout = 30 * time.Second
        return svc, svc
}

// shutdownService implements both context.Context and Service. It embeds the
// parent context and replaces its Done and Err with shutdown-driven ones.
type shutdownService struct {
        context.Context

        // mu is held while isShutdown, callbacks and err are accessed.
        mu         sync.Mutex
        // isShutdown is set by the first call to Shutdown.
        isShutdown bool
        // callbacks are the functions registered through RegisterCallback.
        callbacks  []func(context.Context) error
        // doneC is closed once the shutdown callbacks have finished.
        doneC      chan struct{}
        // err is set just before doneC is closed.
        err        error
        // timeout bounds the context handed to the shutdown callbacks.
        timeout    time.Duration
}

// Shutdown initiates shutdown. Only the first call has any effect.
//
// The callbacks registered so far are run concurrently in the background with
// a context that expires after s.timeout. When they have all returned, the
// first callback error (or ErrShutdown if there was none) is recorded and the
// done channel is closed. Shutdown itself does not wait for that.
func (s *shutdownService) Shutdown() {
        s.mu.Lock()
        defer s.mu.Unlock()
        if s.isShutdown {
                return
        }
        s.isShutdown = true

        runCallbacks := func(pending []func(context.Context) error) {
                ctx, cancel := context.WithTimeout(context.Background(), s.timeout)
                defer cancel()

                grp, ctx := errgroup.WithContext(ctx)
                for _, cb := range pending {
                        cb := cb
                        grp.Go(func() error { return cb(ctx) })
                }

                result := grp.Wait()
                if result == nil {
                        result = ErrShutdown
                }

                s.mu.Lock()
                s.err = result
                close(s.doneC)
                s.mu.Unlock()
        }
        // The callback slice is captured here, while the lock is still held.
        go runCallbacks(s.callbacks)
}

// Done returns the channel that Shutdown's background routine closes after
// the shutdown callbacks have finished. It takes the place of the embedded
// context's Done.
func (s *shutdownService) Done() <-chan struct{} {
        return s.doneC
}

// Err returns the recorded shutdown error: nil until the shutdown callbacks
// have finished, then the first callback error or ErrShutdown. It takes the
// place of the embedded context's Err.
func (s *shutdownService) Err() error {
        s.mu.Lock()
        recorded := s.err
        s.mu.Unlock()
        return recorded
}

// RegisterCallback adds fn to the list of functions that Shutdown runs.
func (s *shutdownService) RegisterCallback(fn func(context.Context) error) {
        s.mu.Lock()
        // append allocates the slice on first use.
        s.callbacks = append(s.callbacks, fn)
        s.mu.Unlock()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package snapshotters

import (
        "context"

        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/log"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// NOTE: The following labels contain the "cri" prefix but they are not specific
// to CRI and can be used by non-CRI clients as well for enabling remote
// snapshotters. The string is retained to keep compatibility with existing
// snapshotter implementations.
const (
        // TargetRefLabel is a label which contains the image reference and will be
        // passed to snapshotters.
        TargetRefLabel = "containerd.io/snapshot/cri.image-ref"
        // TargetManifestDigestLabel is a label which contains the manifest digest and
        // will be passed to snapshotters.
        TargetManifestDigestLabel = "containerd.io/snapshot/cri.manifest-digest"
        // TargetLayerDigestLabel is a label which contains the layer digest and will
        // be passed to snapshotters.
        TargetLayerDigestLabel = "containerd.io/snapshot/cri.layer-digest"
        // TargetImageLayersLabel is a label which contains the layer digests contained
        // in the target image and will be passed to snapshotters for preparing layers
        // in parallel. Skipping some layers is allowed and only affects performance.
        TargetImageLayersLabel = "containerd.io/snapshot/cri.image-layers"
)

// AppendInfoHandlerWrapper makes a handler which appends some basic information
// of images like digests for manifest and their child layers as annotations during unpack.
// These annotations will be passed to snapshotters as labels. These labels will be
// used mainly by remote snapshotters for querying image contents from the remote location.
//
// The wrapped handler runs first. Only when the handled descriptor is a
// manifest are its layer children annotated, in place, with ref, the layer
// digest, the manifest digest and the digests of that layer and the layers
// after it.
func AppendInfoHandlerWrapper(ref string) func(f images.Handler) images.Handler {
        return func(f images.Handler) images.Handler {
                annotate := func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
                        children, err := f.Handle(ctx, desc)
                        if err != nil {
                                return nil, err
                        }
                        if !images.IsManifestType(desc.MediaType) {
                                return children, nil
                        }

                        for idx := range children {
                                layer := &children[idx]
                                if !images.IsLayerType(layer.MediaType) {
                                        continue
                                }
                                if layer.Annotations == nil {
                                        layer.Annotations = make(map[string]string)
                                }
                                layer.Annotations[TargetRefLabel] = ref
                                layer.Annotations[TargetLayerDigestLabel] = layer.Digest.String()
                                layer.Annotations[TargetImageLayersLabel] = getLayers(ctx, TargetImageLayersLabel, children[idx:], labels.Validate)
                                layer.Annotations[TargetManifestDigestLabel] = desc.Digest.String()
                        }
                        return children, nil
                }
                return images.HandlerFunc(annotate)
        }
}

// getLayers returns the comma-separated digests of the layer descriptors in
// descs, in order. Before each digest is appended, the would-be value is
// checked with validate(key, value); at the first failure the digest is left
// out, a debug message is logged and the list built so far is returned. This
// keeps the label within the size limit while including as many digests as
// possible.
func getLayers(ctx context.Context, key string, descs []ocispec.Descriptor, validate func(k, v string) error) (layers string) {
        for _, desc := range descs {
                if !images.IsLayerType(desc.MediaType) {
                        continue
                }

                dgst := desc.Digest.String()
                candidate := dgst
                if layers != "" {
                        candidate = layers + "," + dgst
                }

                // This avoids the label hits the size limitation.
                if err := validate(key, candidate); err != nil {
                        log.G(ctx).WithError(err).WithField("label", key).WithField("digest", dgst).Debug("omitting digest in the layers list")
                        break
                }
                layers = candidate
        }
        return layers
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sys

import "golang.org/x/sys/unix"

// The following contents were copied from Go 1.18.2.
// Use of this source code is governed by the following
// BSD-style license:
//
// Copyright (c) 2009 The Go Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//   * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//   * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// IgnoringEINTR makes a function call and repeats it if it returns an
// EINTR error. This appears to be required even though we install all
// signal handlers with SA_RESTART: see #22838, #38033, #38836, #40846.
// Also #20400 and #36644 are issues in which a signal handler is
// installed without setting SA_RESTART. None of these are the common case,
// but there are enough of them that it seems that we can't avoid
// an EINTR loop.
//
// The first result that is not exactly unix.EINTR, nil included, is returned.
func IgnoringEINTR(fn func() error) error {
        for {
                if err := fn(); err != unix.EINTR {
                        return err
                }
        }
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sys

import "os"

// MkdirAllWithACL is a wrapper for os.MkdirAll on Unix systems: it creates
// path and any missing parents with permission bits perm and applies no
// additional ACL handling.
func MkdirAllWithACL(path string, perm os.FileMode) error {
        return os.MkdirAll(path, perm)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sys

import (
        "fmt"
        "os"
        "strconv"
        "strings"

        "github.com/containerd/containerd/v2/pkg/userns"
        "golang.org/x/sys/unix"
)

// Bounds of the oom_score_adj values accepted by SetOOMScore and used for
// clipping by AdjustOOMScore.
const (
        // OOMScoreAdjMin is from OOM_SCORE_ADJ_MIN https://github.com/torvalds/linux/blob/v5.10/include/uapi/linux/oom.h#L9
        OOMScoreAdjMin = -1000
        // OOMScoreAdjMax is from OOM_SCORE_ADJ_MAX https://github.com/torvalds/linux/blob/v5.10/include/uapi/linux/oom.h#L10
        OOMScoreAdjMax = 1000
)

// AdjustOOMScore sets the oom score for the provided pid. A score outside the
// range OOMScoreAdjMin..OOMScoreAdjMax (-1000..1000) is clipped to the nearest
// bound before it is handed to SetOOMScore.
func AdjustOOMScore(pid, score int) error {
        switch {
        case score > OOMScoreAdjMax:
                score = OOMScoreAdjMax
        case score < OOMScoreAdjMin:
                score = OOMScoreAdjMin
        }
        return SetOOMScore(pid, score)
}

// SetOOMScore sets the oom score for the provided pid
func SetOOMScore(pid, score int) error {
        if score > OOMScoreAdjMax || score < OOMScoreAdjMin {
                return fmt.Errorf("value out of range (%d): OOM score must be between %d and %d", score, OOMScoreAdjMin, OOMScoreAdjMax)
        }
        path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
        f, err := os.OpenFile(path, os.O_WRONLY, 0)
        if err != nil {
                return err
        }
        defer f.Close()
        if _, err = f.WriteString(strconv.Itoa(score)); err != nil {
                if os.IsPermission(err) && (!runningPrivileged() || userns.RunningInUserNS()) {
                        return nil
                }
                return err
        }
        return nil
}

// GetOOMScoreAdj gets the oom score adjustment for a process by reading
// /proc/<pid>/oom_score_adj and parsing its whitespace-trimmed content as an
// integer. It returns 0 together with the error when the file cannot be read.
func GetOOMScoreAdj(pid int) (int, error) {
        raw, err := os.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid))
        if err != nil {
                return 0, err
        }

        score := strings.TrimSpace(string(raw))
        return strconv.Atoi(score)
}

// runningPrivileged reports whether the effective user ID of the calling
// process is 0.
func runningPrivileged() bool {
        const rootUID = 0
        return unix.Geteuid() == rootUID
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package reaper

import (
        "errors"
        "fmt"
        "os/exec"
        "runtime"
        "sync"
        "syscall"
        "time"

        runc "github.com/containerd/go-runc"
        "golang.org/x/sys/unix"
)

// ErrNoSuchProcess is returned when the process no longer exists. Wait
// returns it when the exit channel is closed before an exit event for the
// awaited process has been seen.
var ErrNoSuchProcess = errors.New("no such process")

// bufferSize is the capacity of the exit channel handed out by Subscribe.
const bufferSize = 32

// subscriber pairs an exit-event channel with a flag recording whether that
// channel has been closed. The embedded mutex is held by close and do while
// they run.
type subscriber struct {
        sync.Mutex
        // c is the channel exit events are delivered on.
        c      chan runc.Exit
        // closed is set once c has been closed, so that it is closed only once.
        closed bool
}

// close closes the subscriber's channel under the subscriber lock. Calling it
// again is a no-op.
func (s *subscriber) close() {
        s.Lock()
        if !s.closed {
                close(s.c)
                s.closed = true
        }
        s.Unlock()
}

// do runs fn while holding the subscriber lock. The lock is released with a
// plain Unlock after fn returns, not with defer.
func (s *subscriber) do(fn func()) {
        s.Lock()
        fn()
        s.Unlock()
}

// Reap should be called when the process receives a SIGCHLD. It reaps all
// exited processes and notifies the Default monitor of each exit, stamping
// every exit with the time Reap was entered. For each exit it waits up to one
// second for the notification to complete before moving on to the next one.
// The error from reaping is returned after the exits collected so far have
// been processed.
func Reap() error {
        reapedAt := time.Now()
        exits, err := reap(false)
        for i := range exits {
                exit := runc.Exit{
                        Timestamp: reapedAt,
                        Pid:       exits[i].Pid,
                        Status:    exits[i].Status,
                }
                notified := Default.notify(exit)

                select {
                case <-notified:
                case <-time.After(time.Second):
                }
        }
        return err
}

// Default is the default monitor initialized for the package. It starts with
// an empty subscriber set and is the monitor that Reap notifies.
var Default = &Monitor{
        subscribers: make(map[chan runc.Exit]*subscriber),
}

// Monitor monitors the underlying system for process status changes and
// tracks the subscribers interested in exit events.
type Monitor struct {
        // The embedded mutex is held while subscribers is read or modified.
        sync.Mutex

        // subscribers maps each channel returned by Subscribe to its
        // bookkeeping record.
        subscribers map[chan runc.Exit]*subscriber
}

// Start subscribes to exit events and then starts the command, so that the
// subscription is already in place when the process begins to run. If
// starting fails the subscription is dropped and the error returned;
// otherwise the exit channel is returned for use with Wait.
func (m *Monitor) Start(c *exec.Cmd) (chan runc.Exit, error) {
        ec := m.Subscribe()
        err := c.Start()
        if err == nil {
                return ec, nil
        }
        m.Unsubscribe(ec)
        return nil, err
}

// StartLocked starts the command and registers the process with the reaper,
// like Start, but keeps the calling goroutine locked to its OS thread for the
// duration of the start.
func (m *Monitor) StartLocked(c *exec.Cmd) (chan runc.Exit, error) {
        runtime.LockOSThread()
        defer runtime.UnlockOSThread()
        return m.Start(c)
}

// Wait blocks until an exit event for the command's process arrives on ec,
// ignoring events for other pids. It then waits on the command so that all
// IO is flushed, unsubscribes ec and returns the exit status.
// User should rely on the value of the exit status to determine if the
// command was successful or not.
//
// If ec is closed before a matching event is seen, -1 and ErrNoSuchProcess
// are returned.
func (m *Monitor) Wait(c *exec.Cmd, ec chan runc.Exit) (int, error) {
        for exit := range ec {
                if exit.Pid != c.Process.Pid {
                        continue
                }
                // make sure we flush all IO
                c.Wait()
                m.Unsubscribe(ec)
                return exit.Status, nil
        }
        // return no such process if the ec channel is closed and no more exit
        // events will be sent
        return -1, ErrNoSuchProcess
}

// WaitTimeout waits for the command like Wait, but gives up after timeout.
// On timeout the process is sent SIGKILL and an error describing the command
// is returned with status 0; otherwise the status and error from Wait are
// returned.
func (m *Monitor) WaitTimeout(c *exec.Cmd, ec chan runc.Exit, timeout time.Duration) (int, error) {
        type waitResult struct {
                status int
                err    error
        }

        // One slot of buffer lets the goroutine below complete its send even
        // when the timeout has fired and nobody is receiving any more.
        results := make(chan waitResult, 1)
        go func() {
                defer close(results)

                status, err := m.Wait(c, ec)
                results <- waitResult{status: status, err: err}
        }()

        deadline := time.NewTimer(timeout)
        defer deadline.Stop()

        select {
        case res := <-results:
                return res.status, res.err
        case <-deadline.C:
                syscall.Kill(c.Process.Pid, syscall.SIGKILL)
                return 0, fmt.Errorf("timeout %v for cmd(pid=%d): %s, %s", timeout, c.Process.Pid, c.Path, c.Args)
        }
}

// Subscribe registers a new buffered channel for process exit events and
// returns it.
func (m *Monitor) Subscribe() chan runc.Exit {
        ch := make(chan runc.Exit, bufferSize)
        sub := &subscriber{c: ch}

        m.Lock()
        defer m.Unlock()
        m.subscribers[ch] = sub
        return ch
}

// Unsubscribe closes the subscriber registered for c and removes it from the
// monitor. Channels that are not registered are ignored.
func (m *Monitor) Unsubscribe(c chan runc.Exit) {
        m.Lock()
        defer m.Unlock()

        if sub, found := m.subscribers[c]; found {
                sub.close()
                delete(m.subscribers, c)
        }
}

// getSubscribers returns a copy of the subscriber map taken under the lock, so
// callers can iterate over it without holding the monitor's mutex.
func (m *Monitor) getSubscribers() map[chan runc.Exit]*subscriber {
        m.Lock()
        defer m.Unlock()

        snapshot := make(map[chan runc.Exit]*subscriber, len(m.subscribers))
        for ch, sub := range m.subscribers {
                snapshot[ch] = sub
        }
        return snapshot
}

// notify delivers the exit event e to the monitor's subscribers from a
// background goroutine and returns a channel that is closed once that
// goroutine has finished.
//
// Each send attempt is bounded by a 1ms timer. Subscribers whose send timed
// out are retried on the next pass over a fresh snapshot of the subscribers;
// the goroutine only returns after a pass in which no send timed out.
func (m *Monitor) notify(e runc.Exit) chan struct{} {
        const timeout = 1 * time.Millisecond
        var (
                done    = make(chan struct{}, 1)
                timer   = time.NewTimer(timeout)
                success = make(map[chan runc.Exit]struct{})
        )
        // The timer is only wanted as a reusable object: stop it right away
        // (draining it if it already fired) and re-arm it before every send.
        stop(timer, true)

        go func() {
                defer close(done)

                for {
                        var (
                                failed      int
                                subscribers = m.getSubscribers()
                        )
                        for _, s := range subscribers {
                                // NOTE(review): s.do is defined elsewhere; presumably it
                                // runs the callback while holding the subscriber's own
                                // lock — confirm.
                                s.do(func() {
                                        // closed subscribers never receive anything
                                        if s.closed {
                                                return
                                        }
                                        // already delivered during an earlier pass
                                        if _, ok := success[s.c]; ok {
                                                return
                                        }
                                        timer.Reset(timeout)
                                        // recv records whether timer.C may still hold an
                                        // unread value that stop has to drain.
                                        recv := true
                                        select {
                                        case s.c <- e:
                                                success[s.c] = struct{}{}
                                        case <-timer.C:
                                                // the timer value was consumed here, so
                                                // there is nothing left to drain
                                                recv = false
                                                failed++
                                        }
                                        stop(timer, recv)
                                })
                        }
                        // all subscribers received the message
                        if failed == 0 {
                                return
                        }
                }
        }()
        return done
}

// stop stops timer. If the timer could not be stopped and recv is true, one
// value is received from timer.C; callers pass recv=false when they already
// consumed that value themselves.
func stop(timer *time.Timer, recv bool) {
        if timer.Stop() {
                return
        }
        if recv {
                <-timer.C
        }
}

// exit is the wait4 information from an exited process
type exit struct {
        // Pid is the process id returned by wait4.
        Pid    int
        // Status is the exit status computed by exitStatus from the wait status.
        Status int
}

// reap collects the exit information of child processes of the calling
// process via wait4. With wait=false the call uses WNOHANG and returns as soon
// as no further child has exited; with wait=true no flags are set. ECHILD is
// treated as a normal end of the loop. Any other error is returned together
// with the exits gathered so far.
func reap(wait bool) (exits []exit, err error) {
        var (
                status unix.WaitStatus
                usage  unix.Rusage
        )
        options := 0
        if !wait {
                options = unix.WNOHANG
        }
        for {
                pid, waitErr := unix.Wait4(-1, &status, options, &usage)
                switch {
                case waitErr == unix.ECHILD:
                        return exits, nil
                case waitErr != nil:
                        return exits, waitErr
                case pid <= 0:
                        return exits, nil
                }
                exits = append(exits, exit{Pid: pid, Status: exitStatus(status)})
        }
}

// exitSignalOffset is added to the signal number when a process was
// terminated by a signal.
const exitSignalOffset = 128

// exitStatus converts a wait status into an exit code: the plain exit status
// for a process that exited normally, or exitSignalOffset plus the signal
// number for a process that was signaled.
func exitStatus(status unix.WaitStatus) int {
        if !status.Signaled() {
                return status.ExitStatus()
        }
        return exitSignalOffset + int(status.Signal())
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package reaper

import (
        "unsafe"

        "golang.org/x/sys/unix"
)

// SetSubreaper sets the value i as the subreaper setting for the calling
// process using prctl(PR_SET_CHILD_SUBREAPER).
func SetSubreaper(i int) error {
        value := uintptr(i)
        return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, value, 0, 0, 0)
}

// GetSubreaper returns the subreaper setting for the calling process as
// reported by prctl(PR_GET_CHILD_SUBREAPER). On failure it returns -1 and the
// prctl error.
func GetSubreaper() (int, error) {
        var setting uintptr

        err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&setting)), 0, 0, 0)
        if err != nil {
                return -1, err
        }
        return int(setting), nil
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sys

import (
        "fmt"
        "net"
        "os"
        "path/filepath"

        "golang.org/x/sys/unix"
)

// CreateUnixSocket creates a unix socket at path and returns the listener.
// Paths longer than 104 bytes are rejected. The parent directory is created if
// needed and a stale file at path is removed before listening.
func CreateUnixSocket(path string) (net.Listener, error) {
        // BSDs have a 104 limit
        if len(path) > 104 {
                return nil, fmt.Errorf("%q: unix socket path too long (> 104)", path)
        }

        // NOTE(review): mode 0660 carries no execute (search) bits, which is
        // unusual for a directory — confirm this is intended.
        parent := filepath.Dir(path)
        if err := os.MkdirAll(parent, 0660); err != nil {
                return nil, err
        }

        // A missing socket file is fine; any other unlink failure is fatal.
        unlinkErr := unix.Unlink(path)
        if unlinkErr != nil && !os.IsNotExist(unlinkErr) {
                return nil, unlinkErr
        }
        return net.Listen("unix", path)
}

// GetLocalListener returns a listener on the unix socket at path. The parent
// directory is created for uid/gid when missing, and the socket itself is set
// to mode 0660 and chowned to uid/gid. If adjusting the socket fails, the
// listener is closed and the error is returned.
func GetLocalListener(path string, uid, gid int) (net.Listener, error) {
        // Ensure parent directory is created
        if err := mkdirAs(filepath.Dir(path), uid, gid); err != nil {
                return nil, err
        }

        listener, err := CreateUnixSocket(path)
        if err != nil {
                return listener, err
        }

        // Chown is only attempted when Chmod succeeded; either failure tears
        // the listener down again.
        err = os.Chmod(path, 0660)
        if err == nil {
                err = os.Chown(path, uid, gid)
        }
        if err != nil {
                listener.Close()
                return nil, err
        }
        return listener, nil
}

// mkdirAs creates path (including parents) with mode 0770 and chowns it to
// uid/gid, but only when path does not exist yet. If path already exists nil
// is returned and nothing is changed; any stat error other than "not exist"
// is returned as is.
func mkdirAs(path string, uid, gid int) error {
        _, statErr := os.Stat(path)
        if !os.IsNotExist(statErr) {
                return statErr
        }

        if mkErr := os.MkdirAll(path, 0770); mkErr != nil {
                return mkErr
        }
        return os.Chown(path, uid, gid)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sys

import (
        "runtime"
        "syscall"
        "unsafe"
)

// ForkUserns is to fork child process with user namespace. It returns child
// process's pid and pidfd reference to the child process.
//
// Precondition: The runtime OS thread must be locked, which is GO runtime
// requirement.
//
// Beside this, the child process sets PR_SET_PDEATHSIG with SIGKILL so that
// the parent process's OS thread must be locked. Otherwise, the exit event of
// parent process's OS thread will send kill signal to the child process,
// even if parent process is still running.
//
// In the parent, the returned errno is non-zero when getpid or clone failed.
// The child never returns from this function: it ends in SYS_EXIT.
//
//go:norace
//go:noinline
func ForkUserns() (_pid uintptr, _pidfd uintptr, _ syscall.Errno) {
        var (
                pidfd     uintptr
                pid, ppid uintptr
                err       syscall.Errno
        )

        // Remember our own pid so the child can later verify that it still has
        // the same parent.
        ppid, _, err = syscall.RawSyscall(uintptr(syscall.SYS_GETPID), 0, 0, 0)
        if err != 0 {
                return 0, 0, err
        }

        // NOTE(review): beforeFork/afterFork/afterForkInChild are defined
        // elsewhere; presumably they hook into the Go runtime's fork handling —
        // confirm.
        beforeFork()
        if runtime.GOARCH == "s390x" {
                // NOTE:
                //
                // On the s390 architectures, the order of the first two
                // arguments is reversed.
                //
                // REF: https://man7.org/linux/man-pages/man2/clone.2.html
                pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
                        0,
                        uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
                        uintptr(unsafe.Pointer(&pidfd)),
                )
        } else {
                pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
                        uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
                        0,
                        uintptr(unsafe.Pointer(&pidfd)),
                )
        }
        // Parent (pid != 0) or failed clone (err != 0): finish the fork
        // bookkeeping and hand the result to the caller.
        if err != 0 || pid != 0 {
                afterFork()
                return pid, pidfd, err
        }
        // From here on we are in the child; only raw syscalls are issued.
        afterForkInChild()

        if _, _, err = syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 {
                goto err
        }

        pid, _, err = syscall.RawSyscall(syscall.SYS_GETPPID, 0, 0, 0)
        if err != 0 {
                goto err
        }

        // exit if re-parent
        if pid != ppid {
                goto err
        }

        // NOTE(review): ppoll with all-zero arguments presumably parks the child
        // until it is signalled — confirm.
        _, _, err = syscall.RawSyscall(syscall.SYS_PPOLL, 0, 0, 0)
err:
        // The child terminates here, using the last errno as its exit code.
        syscall.RawSyscall(syscall.SYS_EXIT, uintptr(err), 0, 0)
        panic("unreachable")
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package testutil

import (
        "flag"
        "fmt"
        "os"
        "path/filepath"
        "strconv"
        "testing"

        "github.com/containerd/containerd/v2/core/mount"
        "github.com/stretchr/testify/assert"
)

// umountflags is the flags value Unmount passes to mount.UnmountAll.
const umountflags int = 0

// rootEnabled holds the value of the test.root flag.
var rootEnabled bool

// init registers the test.root flag, which defaults to false.
func init() {
        flag.BoolVar(&rootEnabled, "test.root", false, "enable tests that require root")
}

// DumpDir prints the contents of the directory to the testing logger.
//
// Use this in a defer statement from a test that may allocate and exercise a
// temporary directory. Immensely useful for sanity checking and debugging
// failing tests.
//
// One should still test that contents are as expected. This is only a visual
// tool to assist when things don't go your way.
//
// Symlinks are logged with their target, regular files with their size and up
// to the first 64 bytes of content, everything else with mode, size and path.
// A walk error fails the test via t.Fatalf; an unreadable regular file is only
// logged.
func DumpDir(t *testing.T, root string) {
        logEntry := func(path string, fi os.FileInfo, err error) error {
                if err != nil {
                        return err
                }

                mode := fi.Mode()
                switch {
                case mode&os.ModeSymlink != 0:
                        target, err := os.Readlink(path)
                        if err != nil {
                                return err
                        }
                        t.Log(mode, fmt.Sprintf("%10s", ""), path, "->", target)
                case mode.IsRegular():
                        content, err := os.ReadFile(path)
                        if err != nil {
                                t.Logf("error reading file: %v", err)
                                return nil
                        }
                        // just display a little bit.
                        if len(content) > 64 {
                                content = content[:64]
                        }
                        t.Log(mode, fmt.Sprintf("%10d", fi.Size()), path, "[", strconv.Quote(string(content)), "...]")
                default:
                        t.Log(mode, fmt.Sprintf("%10d", fi.Size()), path)
                }
                return nil
        }

        if err := filepath.Walk(root, logEntry); err != nil {
                t.Fatalf("error dumping directory: %v", err)
        }
}

// DumpDirOnFailure prints the contents of the directory to the testing logger
// if the test has failed; for a passing test it does nothing.
func DumpDirOnFailure(t *testing.T, root string) {
        if !t.Failed() {
                return
        }
        DumpDir(t, root)
}

// Unmount unmounts a given mountPoint and sets t.Error if it fails
func Unmount(t testing.TB, mountPoint string) {
        t.Log("unmount", mountPoint)
        err := mount.UnmountAll(mountPoint, umountflags)
        assert.NoError(t, err)
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package testutil

import (
        "fmt"
        "os"
        "testing"

        "github.com/stretchr/testify/assert"
)

// RequiresRoot skips tests that require root, unless the test.root flag has
// been set. With the flag set it asserts that the effective caller is uid 0.
func RequiresRoot(t testing.TB) {
        if skip := !rootEnabled; skip {
                t.Skip("skipping test that requires root")
        }
        uid := os.Getuid()
        assert.Equal(t, 0, uid, "This test must be run as root.")
}

// RequiresRootM is similar to RequiresRoot but intended to be called from
// *testing.M. Without the test.root flag it prints a notice and exits with
// status 0; with the flag set but a non-root uid it exits with status 1.
func RequiresRootM() {
        switch {
        case !rootEnabled:
                fmt.Fprintln(os.Stderr, "skipping test that requires root")
                os.Exit(0)
        case os.Getuid() != 0:
                fmt.Fprintln(os.Stderr, "This test must be run as root.")
                os.Exit(1)
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package timeout

import (
        "context"
        "sync"
        "time"
)

var (
        // mu guards timeouts.
        mu       sync.RWMutex
        // timeouts holds the per-key durations registered through Set.
        timeouts = make(map[string]time.Duration)

        // DefaultTimeout of the timeout package; Get returns it for keys that
        // have no registered timeout.
        DefaultTimeout = 1 * time.Second
)

// Set registers t as the timeout for key, replacing any previous value.
func Set(key string, t time.Duration) {
        mu.Lock()
        defer mu.Unlock()
        timeouts[key] = t
}

// Get returns the timeout registered for key, or DefaultTimeout when no
// timeout has been set for it.
func Get(key string) time.Duration {
        mu.RLock()
        registered, found := timeouts[key]
        mu.RUnlock()

        if found {
                return registered
        }
        return DefaultTimeout
}

// WithContext derives a context from ctx that times out after the duration
// Get reports for key, and returns it together with its cancel function.
func WithContext(ctx context.Context, key string) (context.Context, func()) {
        return context.WithTimeout(ctx, Get(key))
}

// All returns a copy of every registered key and its timeout. Modifying the
// returned map does not affect the registry.
func All() map[string]time.Duration {
        snapshot := make(map[string]time.Duration)

        mu.RLock()
        for key, d := range timeouts {
                snapshot[key] = d
        }
        mu.RUnlock()

        return snapshot
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package tracing

import (
        "encoding/json"
        "fmt"
        "strings"

        "go.opentelemetry.io/otel/attribute"
)

// spanDelimiter separates the components of a span name.
const spanDelimiter = "."

// makeSpanName joins names into a single span name using spanDelimiter.
func makeSpanName(names ...string) string {
        joined := strings.Join(names, spanDelimiter)
        return joined
}

// any converts an arbitrary value v into an attribute.KeyValue stored under
// key k.
//
//   - nil becomes the string "<nil>".
//   - bool, int, int64, float64, string and slices of those map directly onto
//     the matching attribute constructor.
//   - int8 and int16 (and their slices) are widened to int, int32 (and its
//     slice) to int64.
//   - any other value is rendered as a string: via fmt.Stringer when
//     implemented, otherwise as JSON, and finally with the %v verb when JSON
//     encoding fails or yields nil.
func any(k string, v interface{}) attribute.KeyValue {
        switch val := v.(type) {
        case nil:
                return attribute.String(k, "<nil>")

        case bool:
                return attribute.Bool(k, val)
        case []bool:
                return attribute.BoolSlice(k, val)

        case string:
                return attribute.String(k, val)
        case []string:
                return attribute.StringSlice(k, val)

        case float64:
                return attribute.Float64(k, val)
        case []float64:
                return attribute.Float64Slice(k, val)

        case int:
                return attribute.Int(k, val)
        case []int:
                return attribute.IntSlice(k, val)
        case int64:
                return attribute.Int64(k, val)
        case []int64:
                return attribute.Int64Slice(k, val)

        case int8:
                return attribute.Int(k, int(val))
        case int16:
                return attribute.Int(k, int(val))
        case int32:
                return attribute.Int64(k, int64(val))

        case []int8:
                widened := make([]int, len(val))
                for idx, n := range val {
                        widened[idx] = int(n)
                }
                return attribute.IntSlice(k, widened)
        case []int16:
                widened := make([]int, len(val))
                for idx, n := range val {
                        widened[idx] = int(n)
                }
                return attribute.IntSlice(k, widened)
        case []int32:
                widened := make([]int64, len(val))
                for idx, n := range val {
                        widened[idx] = int64(n)
                }
                return attribute.Int64Slice(k, widened)
        }

        // No native attribute kind: fall back to a string rendering.
        if s, isStringer := v.(fmt.Stringer); isStringer {
                return attribute.String(k, s.String())
        }
        if encoded, err := json.Marshal(v); err == nil && encoded != nil {
                return attribute.String(k, string(encoded))
        }
        return attribute.String(k, fmt.Sprintf("%v", v))
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package tracing

import (
        "github.com/sirupsen/logrus"
        "go.opentelemetry.io/otel/attribute"
        "go.opentelemetry.io/otel/trace"
)

// NewLogrusHook returns a new, ready to use LogrusHook.
func NewLogrusHook() *LogrusHook {
        return new(LogrusHook)
}

// LogrusHook is a logrus hook which adds logrus events to active spans.
// If the span is not recording or the span context is invalid, the hook is a no-op.
// The type carries no state.
type LogrusHook struct{}

// Levels returns the logrus levels that this hook is interested in, which is
// every level (logrus.AllLevels).
func (h *LogrusHook) Levels() []logrus.Level {
        return logrus.AllLevels
}

// Fire is called when a log event occurs. It looks up the span in the entry's
// context and, when that span is valid and recording, adds the log message as
// a span event carrying the entry's fields, its level and its timestamp. It
// always returns nil.
func (h *LogrusHook) Fire(entry *logrus.Entry) error {
        span := trace.SpanFromContext(entry.Context)
        if span == nil || !span.SpanContext().IsValid() || !span.IsRecording() {
                return nil
        }

        eventOpts := []trace.EventOption{
                trace.WithAttributes(logrusDataToAttrs(entry.Data)...),
                trace.WithAttributes(attribute.String("level", entry.Level.String())),
                trace.WithTimestamp(entry.Time),
        }
        span.AddEvent(entry.Message, eventOpts...)
        return nil
}

// logrusDataToAttrs converts every logrus field into an attribute using any.
// The result has one entry per field; its order follows map iteration and is
// therefore unspecified.
func logrusDataToAttrs(data logrus.Fields) []attribute.KeyValue {
        converted := make([]attribute.KeyValue, len(data))
        next := 0
        for name, value := range data {
                converted[next] = any(name, value)
                next++
        }
        return converted
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package tracing

import (
        "context"
        "net/http"

        "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
        "go.opentelemetry.io/otel"
        "go.opentelemetry.io/otel/attribute"
        "go.opentelemetry.io/otel/codes"
        semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
        "go.opentelemetry.io/otel/trace"
)

// StartConfig defines configuration for a new span object.
type StartConfig struct {
        // spanOpts are passed to the tracer when StartSpan creates the span.
        spanOpts []trace.SpanStartOption
}

// SpanOpt is a functional option that modifies a StartConfig; StartSpan
// applies every option it is given before starting the span.
type SpanOpt func(config *StartConfig)

// UpdateHTTPClient wraps the client's current transport in an otelhttp
// transport whose spans are always named name, regardless of the operation or
// request.
func UpdateHTTPClient(client *http.Client, name string) {
        fixedName := func(string, *http.Request) string {
                return name
        }
        client.Transport = otelhttp.NewTransport(client.Transport, otelhttp.WithSpanNameFormatter(fixedName))
}

// StartSpan starts a child span named opName in ctx and returns the derived
// context together with the wrapped span. When ctx already carries a span with
// a valid span context, the tracer is taken from that span's tracer provider;
// otherwise the global otel tracer is used.
func StartSpan(ctx context.Context, opName string, opts ...SpanOpt) (context.Context, *Span) {
        var config StartConfig
        for _, apply := range opts {
                apply(&config)
        }

        tracer := otel.Tracer("")
        parent := trace.SpanFromContext(ctx)
        if parent != nil && parent.SpanContext().IsValid() {
                tracer = parent.TracerProvider().Tracer("")
        }

        spanCtx, started := tracer.Start(ctx, opName, config.spanOpts...)
        return spanCtx, &Span{otelSpan: started}
}

// SpanFromContext wraps the span currently stored in ctx, as reported by
// trace.SpanFromContext, in a new Span.
func SpanFromContext(ctx context.Context) *Span {
        current := trace.SpanFromContext(ctx)
        return &Span{otelSpan: current}
}

// Span is a wrapper around otel trace.Span.
// Span is the individual component of a trace. It represents a
// single named and timed operation of a workflow that is traced.
type Span struct {
        // otelSpan is the wrapped span that every method delegates to.
        otelSpan trace.Span
}

// End completes the span by ending the wrapped otel span.
func (s *Span) End() {
        s.otelSpan.End()
}

// AddEvent adds an event with the provided name and options to the wrapped
// otel span.
func (s *Span) AddEvent(name string, options ...trace.EventOption) {
        s.otelSpan.AddEvent(name, options...)
}

// SetStatus sets the status of the current span. A nil err marks the span as
// Ok; a non-nil err is recorded on the span and the status is set to Error
// with the error's message as description.
func (s *Span) SetStatus(err error) {
        if err == nil {
                s.otelSpan.SetStatus(codes.Ok, "")
                return
        }
        s.otelSpan.RecordError(err)
        s.otelSpan.SetStatus(codes.Error, err.Error())
}

// SetAttributes sets kv as attributes of the wrapped otel span.
func (s *Span) SetAttributes(kv ...attribute.KeyValue) {
        s.otelSpan.SetAttributes(kv...)
}

// Name builds a span name by joining a list of strings in dot separated
// format. It only returns the name; it does not modify any span.
func Name(names ...string) string {
        return makeSpanName(names...)
}

// Attribute takes a key value pair and returns attribute.KeyValue type.
// It is the exported entry point to the package's any conversion helper.
func Attribute(k string, v interface{}) attribute.KeyValue {
        return any(k, v)
}

// HTTPStatusCodeAttributes generates attributes of the HTTP namespace as specified by the OpenTelemetry
// specification for a span. The result contains a single attribute: code under
// semconv.HTTPStatusCodeKey.
func HTTPStatusCodeAttributes(code int) []attribute.KeyValue {
        return []attribute.KeyValue{semconv.HTTPStatusCodeKey.Int(code)}
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package ttrpcutil

import (
        "context"
        "errors"
        "fmt"
        "sync"
        "time"

        v1 "github.com/containerd/containerd/api/services/ttrpc/events/v1"
        "github.com/containerd/containerd/v2/pkg/dialer"
        "github.com/containerd/ttrpc"
)

// ttrpcDialTimeout bounds each dial attempt made by the connector that
// NewClient builds.
const ttrpcDialTimeout = 5 * time.Second

// ttrpcConnector establishes a new TTRPC client connection.
type ttrpcConnector func() (*ttrpc.Client, error)

// Client is the client to interact with TTRPC part of containerd server (plugins, events)
type Client struct {
        // mu is held by the methods of Client while they access the fields below.
        mu        sync.Mutex
        // connector dials a new TTRPC connection.
        connector ttrpcConnector
        // client is the current connection; nil until the first dial.
        client    *ttrpc.Client
        // closed is set by Close and makes Reconnect fail.
        closed    bool
}

// NewClient returns a new containerd TTRPC client for the containerd instance
// at address. No connection is made here: the returned client only stores a
// connector that dials address (bounded by ttrpcDialTimeout) and builds a
// ttrpc client with opts when it is invoked.
func NewClient(address string, opts ...ttrpc.ClientOpts) (*Client, error) {
        dial := func() (*ttrpc.Client, error) {
                dialCtx, cancel := context.WithTimeout(context.Background(), ttrpcDialTimeout)
                defer cancel()

                conn, err := dialer.ContextDialer(dialCtx, address)
                if err != nil {
                        return nil, fmt.Errorf("failed to connect: %w", err)
                }
                return ttrpc.NewClient(conn, opts...), nil
        }

        return &Client{connector: dial}, nil
}

// Reconnect re-establishes the TTRPC connection to the containerd daemon. It
// fails when no connector is available or the client has been closed. An
// existing connection is closed first; if closing fails that error is returned
// and no new connection is made.
func (c *Client) Reconnect() error {
        c.mu.Lock()
        defer c.mu.Unlock()

        switch {
        case c.connector == nil:
                return errors.New("unable to reconnect to containerd, no connector available")
        case c.closed:
                return errors.New("client is closed")
        }

        if current := c.client; current != nil {
                if err := current.Close(); err != nil {
                        return err
                }
        }

        fresh, err := c.connector()
        if err != nil {
                return err
        }
        c.client = fresh
        return nil
}

// EventsService creates an EventsService client on top of the underlying
// TTRPC client. The error from obtaining that client is returned unchanged.
func (c *Client) EventsService() (v1.EventsService, error) {
        conn, err := c.Client()
        if err == nil {
                return v1.NewEventsClient(conn), nil
        }
        return nil, err
}

// Client returns the underlying TTRPC client object, dialing containerd
// through the connector on first use.
//
// Once Close has been called an error is returned instead. Without this check
// a call after Close could dial and store a brand new connection on a client
// its owner already considers shut down, and that connection would never be
// closed. Reconnect rejects closed clients in the same way.
func (c *Client) Client() (*ttrpc.Client, error) {
        c.mu.Lock()
        defer c.mu.Unlock()

        if c.closed {
                return nil, errors.New("client is closed")
        }
        if c.client == nil {
                client, err := c.connector()
                if err != nil {
                        return nil, err
                }
                c.client = client
        }
        return c.client, nil
}

// Close marks the client as closed and closes its TTRPC connection to
// containerd if one was established. Without a connection it returns nil.
func (c *Client) Close() error {
        c.mu.Lock()
        defer c.mu.Unlock()

        c.closed = true
        if c.client == nil {
                return nil
        }
        return c.client.Close()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package userns

import (
        "bufio"
        "fmt"
        "os"
        "sync"
)

var (
        // inUserNS caches the result computed by RunningInUserNS.
        inUserNS bool
        // nsOnce ensures the detection in RunningInUserNS runs only once.
        nsOnce   sync.Once
)

// RunningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
//
// The first line of /proc/self/uid_map is inspected once and the answer is
// cached. If the file cannot be opened or read, false is reported.
func RunningInUserNS() bool {
        nsOnce.Do(func() {
                uidMap, err := os.Open("/proc/self/uid_map")
                if err != nil {
                        // This kernel-provided file only exists if user namespaces are supported
                        return
                }
                defer uidMap.Close()

                raw, _, err := bufio.NewReader(uidMap).ReadLine()
                if err != nil {
                        return
                }

                var first, second, count int64
                fmt.Sscanf(string(raw), "%d %d %d", &first, &second, &count)

                // We assume we are in the initial user namespace if we have a
                // full range - 4294967295 uids starting at uid 0.
                initialNS := first == 0 && second == 0 && count == 4294967295
                inUserNS = !initialNS
        })
        return inUserNS
}

//go:build gofuzz

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "bufio"
        "bytes"
        "context"
        _ "crypto/sha256"
        "io"
        "testing"

        "github.com/opencontainers/go-digest"

        "github.com/containerd/containerd/v2/core/content"
)

// FuzzContentStoreWriter is a go-fuzz entry point that writes data through a
// store writer for the fixed ref "myref" and commits it under its digest.
//
// It returns 1 when the data was committed and 0 when any step failed.
func FuzzContentStoreWriter(data []byte) int {
        t := &testing.T{}
        ctx := context.Background()
        // ctx is reassigned here; only cs and cleanup are newly declared.
        ctx, _, cs, cleanup := contentStoreEnv(t)
        defer cleanup()

        cw, err := cs.Writer(ctx, content.WithRef("myref"))
        if err != nil {
                return 0
        }
        if err := cw.Close(); err != nil {
                return 0
        }

        // reopen, so we can test things
        cw, err = cs.Writer(ctx, content.WithRef("myref"))
        if err != nil {
                return 0
        }
        // Release the writer on every exit path. The ref lock taken by Writer is
        // process-wide (see tryLock), so a writer leaked after a failed copy
        // would make every later fuzz iteration fail to open "myref".
        // NOTE(review): Close after a successful Commit is expected to be a
        // no-op (Commit clears the writer's file handle) — confirm against
        // writer.Close.
        defer cw.Close()

        err = checkCopyFuzz(int64(len(data)), cw, bufio.NewReader(io.NopCloser(bytes.NewReader(data))))
        if err != nil {
                return 0
        }
        expected := digest.FromBytes(data)

        if err = cw.Commit(ctx, int64(len(data)), expected); err != nil {
                return 0
        }
        return 1
}

// checkCopyFuzz copies src into dst and verifies that exactly size bytes were
// transferred.
//
// It returns the io.Copy error if the copy fails, io.ErrShortWrite if the
// number of bytes copied differs from size, and nil otherwise.
func checkCopyFuzz(size int64, dst io.Writer, src io.Reader) error {
        nn, err := io.Copy(dst, src)
        if err != nil {
                return err
        }

        if nn != size {
                // err is nil at this point, so returning it would hide the mismatch.
                return io.ErrShortWrite
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "fmt"
        "sync"
        "time"

        "github.com/containerd/errdefs"
)

// Handles locking references

// lock records an in-process hold on an ingest ref.
type lock struct {
        // since is the time tryLock created the entry; it is reported in the
        // error returned to later callers that find the ref already locked.
        since time.Time
}

var (
        // locks lets us lock in process. It maps a ref to its active lock and
        // is shared by every caller of tryLock/unlock in this package.
        locks   = make(map[string]*lock)
        // locksMu guards all access to locks.
        locksMu sync.Mutex
)

// tryLock claims ref for the calling writer without blocking.
//
// If ref is already held, an error wrapping errdefs.ErrUnavailable is
// returned; otherwise a new entry stamped with the current time is recorded.
func tryLock(ref string) error {
        locksMu.Lock()
        defer locksMu.Unlock()

        held, busy := locks[ref]
        if !busy {
                locks[ref] = &lock{since: time.Now()}
                return nil
        }

        // Returning the duration may help developers distinguish dead locks (long duration) from
        // lock contentions (short duration).
        heldFor := time.Since(held.since)
        return fmt.Errorf(
                "ref %s locked for %s (since %s): %w", ref, heldFor, held.since,
                errdefs.ErrUnavailable,
        )
}

// unlock drops the in-process lock entry for ref. Deleting a ref that is not
// present in the map is a no-op, so calling it more than once is harmless.
func unlock(ref string) {
        locksMu.Lock()
        defer locksMu.Unlock()

        delete(locks, ref)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "fmt"
        "io"
        "os"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/errdefs"
)

// sizeReaderAt is the content.ReaderAt returned by OpenReader. It wraps a
// single file handle that stays open until Close is called, together with the
// file size observed when the reader was created.
type sizeReaderAt struct {
        // size is the value returned by Size and the byte limit used by Reader.
        size int64
        // fp is the open file that ReadAt and Reader read from.
        fp   *os.File
}

// OpenReader stats and opens the file at p and returns it as a
// content.ReaderAt whose Size is the size reported by the stat call.
//
// A missing file, at either step, is reported as an error wrapping
// errdefs.ErrNotFound; any other error is returned unchanged.
func OpenReader(p string) (content.ReaderAt, error) {
        // translate maps "does not exist" onto the package's not-found error.
        translate := func(err error) error {
                if os.IsNotExist(err) {
                        return fmt.Errorf("blob not found: %w", errdefs.ErrNotFound)
                }
                return err
        }

        stat, err := os.Stat(p)
        if err != nil {
                return nil, translate(err)
        }

        file, err := os.Open(p)
        if err != nil {
                return nil, translate(err)
        }

        return sizeReaderAt{size: stat.Size(), fp: file}, nil
}

// ReadAt reads len(p) bytes starting at offset by delegating to the
// underlying file's ReadAt.
func (ra sizeReaderAt) ReadAt(p []byte, offset int64) (int, error) {
        return ra.fp.ReadAt(p, offset)
}

// Size returns the size recorded when the reader was created; the file is not
// consulted again.
func (ra sizeReaderAt) Size() int64 {
        return ra.size
}

// Close closes the underlying file and returns its error.
func (ra sizeReaderAt) Close() error {
        return ra.fp.Close()
}

// Reader returns a sequential reader over the underlying file, limited to
// Size() bytes. It reads through the shared file handle, so successive readers
// continue from wherever the previous sequential read stopped.
func (ra sizeReaderAt) Reader() io.Reader {
        return io.LimitReader(ra.fp, ra.size)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "context"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "strconv"
        "strings"
        "sync"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/pkg/filters"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"

        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// bufPool recycles 1 MiB scratch buffers used for bulk copies. Entries are
// stored as *[]byte so that Get and Put do not copy the slice header into a
// fresh interface allocation.
var bufPool = sync.Pool{
        New: func() interface{} {
                const scratchSize = 1 << 20
                scratch := make([]byte, scratchSize)
                return &scratch
        },
}

// LabelStore is used to store mutable labels for digests. The store in this
// package treats it as optional: when none is configured, labels are not
// reported and Update is rejected.
type LabelStore interface {
        // Get returns all the labels for the given digest
        Get(digest.Digest) (map[string]string, error)

        // Set sets all the labels for a given digest, replacing whatever was
        // stored before.
        Set(digest.Digest, map[string]string) error

        // Update replaces the given labels for a digest,
        // a key with an empty value removes a label.
        // The returned map is used by the store as the digest's resulting labels.
        Update(digest.Digest, map[string]string) (map[string]string, error)
}

// Store is digest-keyed store for content. All data written into the store is
// stored under a verifiable digest.
//
// Store can generally support multi-reader, single-writer ingest of data,
// including resumable ingest.
type store struct {
        // root is the base directory; committed blobs live under root/blobs and
        // in-progress writes under root/ingest.
        root string
        // ls holds labels for blobs. It may be nil, in which case Info and Walk
        // report no labels and Update fails.
        ls   LabelStore
}

// NewStore returns a local content store rooted at root without a label
// store; it is equivalent to NewLabeledStore(root, nil).
func NewStore(root string) (content.Store, error) {
        return NewLabeledStore(root, nil)
}

// NewLabeledStore returns a new content store using the provided label store.
// The ingest directory under root is created if it does not exist yet, and an
// error from that step is returned unchanged.
//
// Note: content stores which are used underneath a metadata store may not
// require labels and should use `NewStore`. `NewLabeledStore` is primarily
// useful for tests or standalone implementations.
func NewLabeledStore(root string, ls LabelStore) (content.Store, error) {
        ingestDir := filepath.Join(root, "ingest")
        if err := os.MkdirAll(ingestDir, 0777); err != nil {
                return nil, err
        }

        cs := &store{root: root, ls: ls}
        return cs, nil
}

// Info returns the metadata of the blob identified by dgst, built from a stat
// of the blob file plus, when a label store is configured, its labels.
//
// A missing blob is reported as an error wrapping errdefs.ErrNotFound.
func (s *store) Info(ctx context.Context, dgst digest.Digest) (content.Info, error) {
        blob, err := s.blobPath(dgst)
        if err != nil {
                return content.Info{}, fmt.Errorf("calculating blob info path: %w", err)
        }

        stat, err := os.Stat(blob)
        switch {
        case err == nil:
                // blob exists, carry on
        case os.IsNotExist(err):
                return content.Info{}, fmt.Errorf("content %v: %w", dgst, errdefs.ErrNotFound)
        default:
                return content.Info{}, err
        }

        if s.ls == nil {
                return s.info(dgst, stat, nil), nil
        }

        labels, err := s.ls.Get(dgst)
        if err != nil {
                return content.Info{}, err
        }
        return s.info(dgst, stat, labels), nil
}

// info assembles a content.Info from a blob's file metadata. The file's
// modification time is reported as CreatedAt and its access time (via
// getATime) as UpdatedAt, mirroring how Update stamps the file with Chtimes.
func (s *store) info(dgst digest.Digest, fi os.FileInfo, labels map[string]string) content.Info {
        var out content.Info
        out.Digest = dgst
        out.Size = fi.Size()
        out.CreatedAt = fi.ModTime()
        out.UpdatedAt = getATime(fi)
        out.Labels = labels
        return out
}

// ReaderAt opens the blob named by desc.Digest for random-access reads.
// Only the digest of desc is used to locate the blob; the caller must close
// the returned reader.
func (s *store) ReaderAt(ctx context.Context, desc ocispec.Descriptor) (content.ReaderAt, error) {
        blob, err := s.blobPath(desc.Digest)
        if err != nil {
                return nil, fmt.Errorf("calculating blob path for ReaderAt: %w", err)
        }

        ra, err := OpenReader(blob)
        if err != nil {
                return nil, fmt.Errorf("blob %s expected at %s: %w", desc.Digest, blob, err)
        }
        return ra, nil
}

// Delete removes a blob by its digest.
//
// While this is safe to do concurrently, safe exist-removal logic must hold
// some global lock on the store.
//
// NOTE(review): os.RemoveAll is documented to return nil when the path does
// not exist, so the ErrNotFound branch below may never be taken — confirm
// whether callers expect a not-found error for a missing blob.
func (s *store) Delete(ctx context.Context, dgst digest.Digest) error {
        blob, err := s.blobPath(dgst)
        if err != nil {
                return fmt.Errorf("calculating blob path for delete: %w", err)
        }

        err = os.RemoveAll(blob)
        switch {
        case err == nil:
                return nil
        case os.IsNotExist(err):
                return fmt.Errorf("content %v: %w", dgst, errdefs.ErrNotFound)
        default:
                return err
        }
}

// Update changes the labels of the blob identified by info.Digest and returns
// the refreshed info.
//
// fieldpaths selects what is written:
//   - no fieldpaths, or the path "labels": the stored labels are replaced by
//     info.Labels as a whole;
//   - "labels.<key>": only <key> is updated with info.Labels[<key>];
//   - anything else is rejected with errdefs.ErrInvalidArgument.
//
// It fails with errdefs.ErrFailedPrecondition when the store has no label
// store, and with errdefs.ErrNotFound when the blob does not exist.
func (s *store) Update(ctx context.Context, info content.Info, fieldpaths ...string) (content.Info, error) {
        if s.ls == nil {
                return content.Info{}, fmt.Errorf("update not supported on immutable content store: %w", errdefs.ErrFailedPrecondition)
        }

        p, err := s.blobPath(info.Digest)
        if err != nil {
                return content.Info{}, fmt.Errorf("calculating blob path for update: %w", err)
        }

        fi, err := os.Stat(p)
        if err != nil {
                if os.IsNotExist(err) {
                        err = fmt.Errorf("content %v: %w", info.Digest, errdefs.ErrNotFound)
                }

                return content.Info{}, err
        }

        var (
                // all is true when the whole label set is replaced (Set) rather
                // than patched key by key (Update).
                all    bool
                labels map[string]string
        )
        if len(fieldpaths) > 0 {
                for _, path := range fieldpaths {
                        if strings.HasPrefix(path, "labels.") {
                                if labels == nil {
                                        labels = map[string]string{}
                                }

                                // A key absent from info.Labels yields "", which the
                                // LabelStore contract treats as "remove this label".
                                key := strings.TrimPrefix(path, "labels.")
                                labels[key] = info.Labels[key]
                                continue
                        }

                        switch path {
                        case "labels":
                                all = true
                                labels = info.Labels
                        default:
                                return content.Info{}, fmt.Errorf("cannot update %q field on content info %q: %w", path, info.Digest, errdefs.ErrInvalidArgument)
                        }
                }
        } else {
                all = true
                labels = info.Labels
        }

        if all {
                err = s.ls.Set(info.Digest, labels)
        } else {
                labels, err = s.ls.Update(info.Digest, labels)
        }
        if err != nil {
                return content.Info{}, err
        }

        info = s.info(info.Digest, fi, labels)
        info.UpdatedAt = time.Now()

        // The update time is persisted as the file's access time while the
        // modification time (CreatedAt) is kept; failure here is only logged.
        if err := os.Chtimes(p, info.UpdatedAt, info.CreatedAt); err != nil {
                log.G(ctx).WithError(err).Warnf("could not change access time for %s", info.Digest)
        }

        return info, nil
}

// Walk visits every blob under root/blobs and calls fn with its info when the
// info matches the filters in fs. The expected layout is
// blobs/<algorithm>/<encoded digest>.
//
// Walking stops at the first error returned by the directory walk, the label
// store, or fn.
func (s *store) Walk(ctx context.Context, fn content.WalkFunc, fs ...string) error {
        root := filepath.Join(s.root, "blobs")

        filter, err := filters.ParseAll(fs...)
        if err != nil {
                return err
        }

        // alg tracks the algorithm directory currently being descended; it is
        // updated by the callback each time a direct child of root is visited.
        var alg digest.Algorithm
        return filepath.Walk(root, func(path string, fi os.FileInfo, err error) error {
                if err != nil {
                        return err
                }
                // Ignore files encountered while no usable algorithm is selected.
                if !fi.IsDir() && !alg.Available() {
                        return nil
                }

                // TODO(stevvooe): There are few more cases with subdirs that should be
                // handled in case the layout gets corrupted. This isn't strict enough
                // and may spew bad data.

                if path == root {
                        return nil
                }
                if filepath.Dir(path) == root {
                        // The entry name is taken as the digest algorithm.
                        alg = digest.Algorithm(filepath.Base(path))

                        if !alg.Available() {
                                alg = ""
                                return filepath.SkipDir
                        }

                        // descending into a hash directory
                        return nil
                }

                dgst := digest.NewDigestFromEncoded(alg, filepath.Base(path))
                if err := dgst.Validate(); err != nil {
                        // log error but don't report
                        log.L.WithError(err).WithField("path", path).Error("invalid digest for blob path")
                        // if we see this, it could mean some sort of corruption of the
                        // store or extra paths not expected previously.
                }
                // NOTE: an entry with an invalid digest is still looked up and
                // passed on below; only the log line above marks it.

                var labels map[string]string
                if s.ls != nil {
                        labels, err = s.ls.Get(dgst)
                        if err != nil {
                                return err
                        }
                }

                info := s.info(dgst, fi, labels)
                if !filter.Match(content.AdaptInfo(info)) {
                        return nil
                }
                return fn(info)
        })
}

// Status returns the state of the in-progress ingest identified by ref, read
// from that ref's ingest directory.
func (s *store) Status(ctx context.Context, ref string) (content.Status, error) {
        return s.status(s.ingestRoot(ref))
}

// ListStatuses returns the status of every ingest under root/ingest whose
// status matches the filters in fs.
//
// Ingests that disappear while the listing is in progress are skipped rather
// than reported as errors.
func (s *store) ListStatuses(ctx context.Context, fs ...string) ([]content.Status, error) {
        fp, err := os.Open(filepath.Join(s.root, "ingest"))
        if err != nil {
                return nil, err
        }
        defer fp.Close()

        fis, err := fp.Readdirnames(-1)
        if err != nil {
                return nil, err
        }

        filter, err := filters.ParseAll(fs...)
        if err != nil {
                return nil, err
        }

        var active []content.Status
        for _, fi := range fis {
                p := filepath.Join(s.root, "ingest", fi)
                stat, err := s.status(p)
                if err != nil {
                        // s.status rewraps missing-file errors around
                        // errdefs.ErrNotFound, which os.IsNotExist does not see
                        // through, so both forms have to be checked here.
                        if !os.IsNotExist(err) && !errdefs.IsNotFound(err) {
                                return nil, err
                        }

                        // TODO(stevvooe): This is a common error if uploads are being
                        // completed while making this listing. Need to consider taking a
                        // lock on the whole store to coordinate this aspect.
                        //
                        // Another option is to cleanup downloads asynchronously and
                        // coordinate this method with the cleanup process.
                        //
                        // For now, we just skip them, as they really don't exist.
                        continue
                }

                if filter.Match(adaptStatus(stat)) {
                        active = append(active, stat)
                }
        }

        return active, nil
}

// WalkStatusRefs calls fn with the ref recorded for each entry of the ingest
// directory. An entry whose ref file cannot be read is logged and skipped,
// which can happen when ingests are altered while the walk is running. The
// walk stops at the first error returned by fn.
func (s *store) WalkStatusRefs(ctx context.Context, fn func(string) error) error {
        dir, err := os.Open(filepath.Join(s.root, "ingest"))
        if err != nil {
                return err
        }
        defer dir.Close()

        names, err := dir.Readdirnames(-1)
        if err != nil {
                return err
        }

        for _, name := range names {
                refFile := filepath.Join(s.root, "ingest", name, "ref")

                ref, readErr := readFileString(refFile)
                if readErr != nil {
                        log.G(ctx).WithError(readErr).WithField("path", refFile).Error("failed to read ingest ref")
                        continue
                }

                if err := fn(ref); err != nil {
                        return err
                }
        }

        return nil
}

// status builds the content.Status of the ingest stored in ingestPath from its
// "data", "ref", "startedat", "updatedat" and "total" files.
//
// A missing data or ref file is reported as an error wrapping
// errdefs.ErrNotFound.
func (s *store) status(ingestPath string) (content.Status, error) {
        // asNotFound rewraps "does not exist" errors; everything else passes through.
        asNotFound := func(err error) error {
                if os.IsNotExist(err) {
                        return fmt.Errorf("%s: %w", err.Error(), errdefs.ErrNotFound)
                }
                return err
        }

        dataInfo, err := os.Stat(filepath.Join(ingestPath, "data"))
        if err != nil {
                return content.Status{}, asNotFound(err)
        }

        ref, err := readFileString(filepath.Join(ingestPath, "ref"))
        if err != nil {
                return content.Status{}, asNotFound(err)
        }

        startedAt, err := readFileTimestamp(filepath.Join(ingestPath, "startedat"))
        if err != nil {
                return content.Status{}, fmt.Errorf("could not read startedat: %w", err)
        }

        updatedAt, err := readFileTimestamp(filepath.Join(ingestPath, "updatedat"))
        if err != nil {
                return content.Status{}, fmt.Errorf("could not read updatedat: %w", err)
        }

        // because we don't write updatedat on every write, the mod time may
        // actually be more up to date.
        if modified := dataInfo.ModTime(); modified.After(updatedAt) {
                updatedAt = modified
        }

        return content.Status{
                Ref:       ref,
                Offset:    dataInfo.Size(),
                Total:     s.total(ingestPath),
                UpdatedAt: updatedAt,
                StartedAt: startedAt,
        }, nil
}

// adaptStatus exposes a content.Status to the filter package. Only the "ref"
// field is available for matching; every other field path reports "not
// present".
func adaptStatus(status content.Status) filters.Adaptor {
        lookup := func(fieldpath []string) (string, bool) {
                if len(fieldpath) > 0 && fieldpath[0] == "ref" {
                        return status.Ref, true
                }
                return "", false
        }
        return filters.AdapterFunc(lookup)
}

// total reads the expected size of the write from the ingest's "total" file.
// It returns 0 when the file cannot be read or does not hold a base-10 integer.
func (s *store) total(ingestPath string) int64 {
        raw, err := readFileString(filepath.Join(ingestPath, "total"))
        if err != nil {
                return 0
        }

        if size, err := strconv.ParseInt(raw, 10, 64); err == nil {
                return size
        }

        // represents a corrupted file, should probably remove.
        return 0
}

// Writer begins or resumes the active writer identified by ref. If the writer
// is already in use, an error is returned. Only one writer may be in use per
// ref at a time.
//
// The ref is supplied through opts (content.WithRef) and uniquely identifies a
// long-lived writer transaction; an empty ref is rejected with
// errdefs.ErrInvalidArgument.
func (s *store) Writer(ctx context.Context, opts ...content.WriterOpt) (content.Writer, error) {
        var wOpts content.WriterOpts
        for i := range opts {
                if err := opts[i](&wOpts); err != nil {
                        return nil, err
                }
        }

        // TODO(AkihiroSuda): we could create a random string or one calculated based on the context
        // https://github.com/containerd/containerd/issues/2129#issuecomment-380255019
        ref := wOpts.Ref
        if ref == "" {
                return nil, fmt.Errorf("ref must not be empty: %w", errdefs.ErrInvalidArgument)
        }

        if err := tryLock(ref); err != nil {
                return nil, err
        }

        w, err := s.writer(ctx, ref, wOpts.Desc.Size, wOpts.Desc.Digest)
        if err != nil {
                // The writer was never handed out, so the lock must be released here.
                unlock(ref)
                return nil, err
        }

        return w, nil // lock is now held by w.
}

// resumeStatus validates an existing ingest for ref and replays its data file
// into digester so that hashing can continue where the previous writer stopped.
//
// The returned status carries the number of bytes replayed as Offset. An error
// is returned when the status cannot be read, the stored ref differs from ref,
// the totals disagree, or the data file cannot be opened or read.
func (s *store) resumeStatus(ref string, total int64, digester digest.Digester) (content.Status, error) {
        ingestDir, _, dataPath := s.ingestPaths(ref)

        status, err := s.status(ingestDir)
        switch {
        case err != nil:
                return status, fmt.Errorf("failed reading status of resume write: %w", err)
        case ref != status.Ref:
                // NOTE(stevvooe): This is fairly catastrophic. Either we have some
                // layout corruption or a hash collision for the ref key.
                return status, fmt.Errorf("ref key does not match: %v != %v", ref, status.Ref)
        case total > 0 && status.Total > 0 && total != status.Total:
                return status, fmt.Errorf("provided total differs from status: %v != %v", total, status.Total)
        }

        //nolint:dupword
        // TODO(stevvooe): slow slow slow!!, send to goroutine or use resumable hashes
        dataFile, err := os.Open(dataPath)
        if err != nil {
                return status, err
        }
        defer dataFile.Close()

        scratch := bufPool.Get().(*[]byte)
        defer bufPool.Put(scratch)

        status.Offset, err = io.CopyBuffer(digester.Hash(), dataFile, *scratch)
        return status, err
}

// writer provides the main implementation of the Writer method. The caller
// must hold the lock correctly and release on error if there is a problem.
//
// If expected is set and a blob with that digest already exists, the call
// fails with errdefs.ErrAlreadyExists. Otherwise the ingest directory for ref
// is created, or, when it already exists, an attempt is made to resume it; if
// resuming fails the ingest metadata is written afresh. The returned writer is
// positioned at the resume offset (0 for a new ingest).
func (s *store) writer(ctx context.Context, ref string, total int64, expected digest.Digest) (content.Writer, error) {
        // TODO(stevvooe): Need to actually store expected here. We have
        // code in the service that shouldn't be dealing with this.
        if expected != "" {
                p, err := s.blobPath(expected)
                if err != nil {
                        return nil, fmt.Errorf("calculating expected blob path for writer: %w", err)
                }
                if _, err := os.Stat(p); err == nil {
                        return nil, fmt.Errorf("content %v: %w", expected, errdefs.ErrAlreadyExists)
                }
        }

        path, refp, data := s.ingestPaths(ref)

        var (
                digester  = digest.Canonical.Digester()
                offset    int64
                startedAt time.Time
                updatedAt time.Time
        )

        foundValidIngest := false
        // ensure that the ingest path has been created.
        if err := os.Mkdir(path, 0755); err != nil {
                if !os.IsExist(err) {
                        return nil, err
                }
                // The directory already exists: try to pick up the previous ingest.
                // NOTE(review): resumeStatus feeds the existing data into digester
                // before it can fail; on a late failure the same digester is still
                // used for the fresh ingest below — confirm this is acceptable.
                status, err := s.resumeStatus(ref, total, digester)
                if err == nil {
                        foundValidIngest = true
                        updatedAt = status.UpdatedAt
                        startedAt = status.StartedAt
                        total = status.Total
                        offset = status.Offset
                } else {
                        log.G(ctx).Infof("failed to resume the status from path %s: %s. will recreate them", path, err.Error())
                }
        }

        if !foundValidIngest {
                startedAt = time.Now()
                updatedAt = startedAt

                // the ingest is new, we need to setup the target location.
                // write the ref to a file for later use
                if err := os.WriteFile(refp, []byte(ref), 0666); err != nil {
                        return nil, err
                }

                if err := writeTimestampFile(filepath.Join(path, "startedat"), startedAt); err != nil {
                        return nil, err
                }

                if err := writeTimestampFile(filepath.Join(path, "updatedat"), startedAt); err != nil {
                        return nil, err
                }

                // The total file is only written when the expected size is known.
                if total > 0 {
                        if err := os.WriteFile(filepath.Join(path, "total"), []byte(fmt.Sprint(total)), 0666); err != nil {
                                return nil, err
                        }
                }
        }

        // The data file is opened without truncation so a resumed ingest keeps
        // the bytes that were already written.
        fp, err := os.OpenFile(data, os.O_WRONLY|os.O_CREATE, 0666)
        if err != nil {
                return nil, fmt.Errorf("failed to open data file: %w", err)
        }

        if _, err := fp.Seek(offset, io.SeekStart); err != nil {
                fp.Close()
                return nil, fmt.Errorf("could not seek to current write offset: %w", err)
        }

        return &writer{
                s:         s,
                fp:        fp,
                ref:       ref,
                path:      path,
                offset:    offset,
                total:     total,
                digester:  digester,
                startedAt: startedAt,
                updatedAt: updatedAt,
        }, nil
}

// Abort an active transaction keyed by ref. If the ingest is active, it will
// be cancelled. Any resources associated with the ingest will be cleaned.
//
// NOTE(review): os.RemoveAll is documented to return nil when the path does
// not exist, so the ErrNotFound branch below may never be taken — confirm
// whether a missing ingest is meant to be an error.
func (s *store) Abort(ctx context.Context, ref string) error {
        err := os.RemoveAll(s.ingestRoot(ref))
        switch {
        case err == nil:
                return nil
        case os.IsNotExist(err):
                return fmt.Errorf("ingest ref %q: %w", ref, errdefs.ErrNotFound)
        default:
                return err
        }
}

// blobPath maps a digest to the location of its blob,
// root/blobs/<algorithm>/<encoded>. A digest that fails validation is
// rejected with an error wrapping errdefs.ErrInvalidArgument.
func (s *store) blobPath(dgst digest.Digest) (string, error) {
        if err := dgst.Validate(); err != nil {
                return "", fmt.Errorf("cannot calculate blob path from invalid digest: %v: %w", err, errdefs.ErrInvalidArgument)
        }

        algDir := dgst.Algorithm().String()
        blobFile := dgst.Encoded()
        return filepath.Join(s.root, "blobs", algDir, blobFile), nil
}

// ingestRoot returns the directory that holds the ingest state for ref.
func (s *store) ingestRoot(ref string) string {
        // we take a digest of the ref to keep the ingest paths constant length.
        // Note that this is not the current or potential digest of incoming content.
        refKey := digest.FromString(ref).Encoded()
        return filepath.Join(s.root, "ingest", refKey)
}

// ingestPaths returns, in order, the three locations that make up the ingest
// for ref:
//
// - root: entire ingest directory
// - ref: file holding the name of the starting ref, must be unique
// - data: file where data is written
func (s *store) ingestPaths(ref string) (string, string, string) {
        root := s.ingestRoot(ref)
        return root, filepath.Join(root, "ref"), filepath.Join(root, "data")
}

// readFileString returns the whole content of the file at path as a string,
// together with any error from reading it. The content is returned verbatim,
// without trimming.
func readFileString(path string) (string, error) {
        raw, err := os.ReadFile(path)
        contents := string(raw)
        return contents, err
}

// readFileTimestamp reads a file with just a timestamp present and parses it
// with time.Time.UnmarshalText. A missing file is reported as an error
// wrapping errdefs.ErrNotFound.
func readFileTimestamp(p string) (time.Time, error) {
        raw, err := os.ReadFile(p)
        switch {
        case err == nil:
                // file read, parse below
        case os.IsNotExist(err):
                return time.Time{}, fmt.Errorf("%s: %w", err.Error(), errdefs.ErrNotFound)
        default:
                return time.Time{}, err
        }

        var stamp time.Time
        if err := stamp.UnmarshalText(raw); err != nil {
                return time.Time{}, fmt.Errorf("could not parse timestamp file %v: %w", p, err)
        }

        return stamp, nil
}

// writeTimestampFile stores t at p in the text form produced by
// time.Time.MarshalText, the format readFileTimestamp parses. The file is
// written via writeToCompletion, i.e. to a temporary file that is then renamed.
func writeTimestampFile(p string, t time.Time) error {
        b, err := t.MarshalText()
        if err != nil {
                return err
        }
        return writeToCompletion(p, b, 0666)
}

// writeToCompletion writes data to path by first writing it to "<path>.tmp"
// (opened with O_SYNC) and then renaming that file over path, so readers never
// observe a partially written file.
//
// mode is the permission used when the temporary file is created. On any
// failure after the temporary file was created, the temporary file is removed
// (best effort) and the error is returned wrapped with the failing step.
func writeToCompletion(path string, data []byte, mode os.FileMode) error {
        tmp := fmt.Sprintf("%s.tmp", path)
        f, err := os.OpenFile(tmp, os.O_RDWR|os.O_CREATE|os.O_TRUNC|os.O_SYNC, mode)
        if err != nil {
                return fmt.Errorf("create tmp file: %w", err)
        }

        _, writeErr := f.Write(data)
        closeErr := f.Close()
        if writeErr != nil {
                _ = os.Remove(tmp) // best effort: do not leave a partial tmp file behind
                return fmt.Errorf("write tmp file: %w", writeErr)
        }
        if closeErr != nil {
                _ = os.Remove(tmp) // best effort
                return fmt.Errorf("close tmp file: %w", closeErr)
        }

        if err := os.Rename(tmp, path); err != nil {
                _ = os.Remove(tmp) // best effort
                return fmt.Errorf("rename tmp file: %w", err)
        }
        return nil
}

//go:build linux || solaris

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "os"
        "syscall"
        "time"
)

// getATime returns the last access time of the file described by fi, taken
// from the underlying syscall.Stat_t. When fi does not carry a *syscall.Stat_t
// it falls back to the modification time.
func getATime(fi os.FileInfo) time.Time {
        st, ok := fi.Sys().(*syscall.Stat_t)
        if !ok {
                return fi.ModTime()
        }

        sec, nsec := st.Atim.Unix()
        return time.Unix(sec, nsec)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "context"
        "testing"

        "github.com/containerd/containerd/v2/core/content"
)

// contentStoreEnv prepares a store for tests. It returns a cancellable
// context, the temporary directory backing the store (from t.TempDir), the
// store itself, and a cleanup function that cancels the context. The test is
// failed immediately if the store cannot be created.
func contentStoreEnv(t testing.TB) (context.Context, string, content.Store, func()) {
        dir := t.TempDir()

        cs, err := NewStore(dir)
        if err != nil {
                t.Fatal(err)
        }

        ctx, cancel := context.WithCancel(context.Background())
        cleanup := func() {
                cancel()
        }
        return ctx, dir, cs, cleanup
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package local

import (
        "context"
        "errors"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "runtime"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/opencontainers/go-digest"
)

// writer represents a write transaction against the blob store.
//
// The zero fp value (nil) marks the writer as closed or committed; Commit,
// Close and Sync all check for it.
type writer struct {
        s         *store           // owning store; used to resolve the blob path and set labels on commit
        fp        *os.File         // opened data file
        path      string           // path to writer dir
        ref       string           // ref key
        offset    int64            // running byte count, advanced by Write and reset by Truncate
        total     int64            // reported as Status.Total; not modified by the methods in this file
        digester  digest.Digester  // running digest of the bytes written so far
        startedAt time.Time        // reported as Status.StartedAt
        updatedAt time.Time        // refreshed on every Write; persisted by Close
}

// Status reports a snapshot of the transaction: its ref, the current offset,
// the expected total and the start/update timestamps. The returned error is
// always nil.
func (w *writer) Status() (content.Status, error) {
        var st content.Status
        st.Ref = w.ref
        st.Offset = w.offset
        st.Total = w.total
        st.StartedAt = w.startedAt
        st.UpdatedAt = w.updatedAt
        return st, nil
}

// Digest returns the current digest of the content, up to the current write.
// The value comes straight from the writer's running digester, so it reflects
// only the bytes that have been fed to it so far.
//
// Cannot be called concurrently with `Write`.
func (w *writer) Digest() digest.Digest {
        return w.digester.Digest()
}

// Write appends p to the ingest file and folds the bytes that were actually
// written into the running digest.
//
// The offset advances by n, the number of bytes the file accepted, rather
// than len(p). On a short or failed write this keeps the reported offset in
// step with both the data on disk and the digest, which only sees p[:n].
//
// Note that writes are unbuffered to the backing file. When writing, it is
// recommended to wrap in a bufio.Writer or, preferably, use io.CopyBuffer.
//
// It returns the byte count and error reported by the underlying file write.
func (w *writer) Write(p []byte) (n int, err error) {
        n, err = w.fp.Write(p)
        w.digester.Hash().Write(p[:n])
        w.offset += int64(n)
        w.updatedAt = time.Now()
        return n, err
}

// Commit finalizes the write transaction and publishes the ingested data as a
// blob addressed by its digest.
//
// The opts are applied to a content.Info whose labels, if any, are stored
// after the blob is in place. When size is greater than zero it must match the
// size of the ingest file, and when expected is non-empty it must match the
// computed digest; a mismatch yields an error wrapping
// errdefs.ErrFailedPrecondition. Committing a writer whose file is already
// closed also wraps errdefs.ErrFailedPrecondition. If a blob already exists at
// the target path, the ingest directory is removed and an error wrapping
// errdefs.ErrAlreadyExists is returned.
//
// Once the ingest file has been renamed into place, remaining failures
// (timestamps, cleanup, labels, permissions) are logged together with the
// underlying error but are not returned.
//
// The ref lock is released on every return path.
func (w *writer) Commit(ctx context.Context, size int64, expected digest.Digest, opts ...content.Opt) error {
        // Ensure even on error the writer is fully closed
        defer unlock(w.ref)

        var base content.Info
        for _, opt := range opts {
                if err := opt(&base); err != nil {
                        return err
                }
        }

        // Detach the file from the writer up front so that the writer is
        // considered closed regardless of how the commit ends.
        fp := w.fp
        w.fp = nil

        if fp == nil {
                return fmt.Errorf("cannot commit on closed writer: %w", errdefs.ErrFailedPrecondition)
        }

        if err := fp.Sync(); err != nil {
                fp.Close()
                return fmt.Errorf("sync failed: %w", err)
        }

        // Stat before closing, but always close; the stat error takes
        // precedence over the close error.
        fi, err := fp.Stat()
        closeErr := fp.Close()
        if err != nil {
                return fmt.Errorf("stat on ingest file failed: %w", err)
        }
        if closeErr != nil {
                return fmt.Errorf("failed to close ingest file: %w", closeErr)
        }

        if size > 0 && size != fi.Size() {
                return fmt.Errorf("unexpected commit size %d, expected %d: %w", fi.Size(), size, errdefs.ErrFailedPrecondition)
        }

        dgst := w.digester.Digest()
        if expected != "" && expected != dgst {
                return fmt.Errorf("unexpected commit digest %s, expected %s: %w", dgst, expected, errdefs.ErrFailedPrecondition)
        }

        var (
                ingest    = filepath.Join(w.path, "data")
                target, _ = w.s.blobPath(dgst) // ignore error because we calculated this dgst
        )

        // make sure parent directories of blob exist
        if err := os.MkdirAll(filepath.Dir(target), 0755); err != nil {
                return err
        }

        if _, err := os.Stat(target); err == nil {
                // collision with the target file!
                if err := os.RemoveAll(w.path); err != nil {
                        log.G(ctx).WithError(err).WithField("ref", w.ref).WithField("path", w.path).Error("failed to remove ingest directory")
                }
                return fmt.Errorf("content %v: %w", dgst, errdefs.ErrAlreadyExists)
        }

        if err := os.Rename(ingest, target); err != nil {
                return err
        }

        // Ingest has now been made available in the content store, attempt to complete
        // setting metadata but errors should only be logged and not returned since
        // the content store cannot be cleanly rolled back.

        commitTime := time.Now()
        if err := os.Chtimes(target, commitTime, commitTime); err != nil {
                log.G(ctx).WithError(err).WithField("digest", dgst).Error("failed to change file time to commit time")
        }

        // clean up!!
        if err := os.RemoveAll(w.path); err != nil {
                log.G(ctx).WithError(err).WithField("ref", w.ref).WithField("path", w.path).Error("failed to remove ingest directory")
        }

        if w.s.ls != nil && base.Labels != nil {
                if err := w.s.ls.Set(dgst, base.Labels); err != nil {
                        log.G(ctx).WithError(err).WithField("digest", dgst).Error("failed to set labels")
                }
        }

        // change to readonly, more important for read, but provides _some_
        // protection from this point on. We use the existing perms with a mask
        // only allowing reads honoring the umask on creation.
        //
        // This removes write and exec, only allowing read per the creation umask.
        //
        // NOTE: Windows does not support this operation
        if runtime.GOOS != "windows" {
                if err := os.Chmod(target, (fi.Mode()&os.ModePerm)&^0333); err != nil {
                        log.G(ctx).WithError(err).WithField("ref", w.ref).Error("failed to make readonly")
                }
        }

        return nil
}

// Close the writer, flushing any unwritten data and leaving the progress
// intact.
//
// If one needs to resume the transaction, a new writer can be obtained from
// `Ingester.Writer` using the same key. The write can then be continued
// from where it was left off.
//
// To abandon a transaction completely, first call close then `IngestManager.Abort` to
// clean up the associated resources.
//
// Closing an already closed writer is a no-op that returns nil. Otherwise the
// result of closing the data file is returned; the sync and timestamp-file
// results are not reported.
func (w *writer) Close() (err error) {
        if w.fp == nil {
                return nil
        }

        w.fp.Sync()
        err = w.fp.Close()
        // Persist the last update time next to the ingest data.
        writeTimestampFile(filepath.Join(w.path, "updatedat"), w.updatedAt)
        w.fp = nil
        unlock(w.ref)
        return err
}

// Truncate resets the transaction to empty. Only a size of zero is supported;
// any other value is rejected with an error. The offset and the running digest
// are reset before the file is rewound and truncated, so they stay reset even
// if one of the file operations fails.
func (w *writer) Truncate(size int64) error {
        if size != 0 {
                return errors.New("Truncate: unsupported size")
        }

        w.offset = 0
        w.digester.Hash().Reset()

        _, seekErr := w.fp.Seek(0, io.SeekStart)
        if seekErr != nil {
                return seekErr
        }
        return w.fp.Truncate(0)
}

// Sync flushes the open data file to stable storage. It is a no-op returning
// nil when the writer has no open file.
func (w *writer) Sync() error {
        if w.fp == nil {
                return nil
        }
        return w.fp.Sync()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package cri

import (
        "context"
        "fmt"
        "io"

        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "google.golang.org/grpc"
        runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/sandbox"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        "github.com/containerd/containerd/v2/internal/cri/constants"
        "github.com/containerd/containerd/v2/internal/cri/instrument"
        "github.com/containerd/containerd/v2/internal/cri/nri"
        "github.com/containerd/containerd/v2/internal/cri/server"
        nriservice "github.com/containerd/containerd/v2/internal/nri"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services/warning"
        "github.com/containerd/containerd/v2/version"
        "github.com/containerd/platforms"
)

// init registers the CRI gRPC service plugin under the ID "cri", together
// with its default server configuration and a migration for older config
// versions.
func init() {
        defaultConfig := criconfig.DefaultServerConfig()

        // migrate reduces a legacy "cri" plugin section to the single key this
        // plugin still owns, or drops the section when that key is absent.
        migrate := func(ctx context.Context, configVersion int, pluginConfigs map[string]interface{}) error {
                if configVersion >= version.ConfigVersion {
                        return nil
                }
                const pluginName = string(plugins.GRPCPlugin) + ".cri"
                legacy, found := pluginConfigs[pluginName]
                if !found {
                        return nil
                }
                src := legacy.(map[string]interface{})

                // Currently only a single key migrated
                val, found := src["disable_tcp_service"]
                if !found {
                        delete(pluginConfigs, pluginName)
                        return nil
                }
                pluginConfigs[pluginName] = map[string]interface{}{
                        "disable_tcp_service": val,
                }
                return nil
        }

        required := []plugin.Type{
                plugins.CRIServicePlugin,
                plugins.SandboxControllerPlugin,
                plugins.NRIApiPlugin,
                plugins.EventPlugin,
                plugins.ServicePlugin,
                plugins.LeasePlugin,
                plugins.SandboxStorePlugin,
                plugins.TransferPlugin,
                plugins.WarningPlugin,
        }

        registry.Register(&plugin.Registration{
                Type:            plugins.GRPCPlugin,
                ID:              "cri",
                Requires:        required,
                Config:          &defaultConfig,
                ConfigMigration: migrate,
                InitFn:          initCRIService,
        })
}

// initCRIService is the plugin InitFn for the CRI gRPC service.
//
// It resolves the CRI "runtime" and "images" plugins, validates the server
// configuration (emitting any warnings through the warning service), creates
// an in-memory containerd client, collects the sandbox controllers and builds
// the CRI service. The service is then started in a background goroutine.
//
// The returned value implements gRPC registration; unless the configuration
// disables it, the value is wrapped so that it also registers on the TCP
// server. All dependency failures are returned as wrapped errors so callers
// can inspect the cause with errors.Is / errors.As.
func initCRIService(ic *plugin.InitContext) (interface{}, error) {
        ctx := ic.Context
        config := ic.Config.(*criconfig.ServerConfig)

        // Get runtime service.
        criRuntimePlugin, err := ic.GetByID(plugins.CRIServicePlugin, "runtime")
        if err != nil {
                return nil, fmt.Errorf("unable to load CRI runtime service plugin dependency: %w", err)
        }

        // Get image service.
        criImagePlugin, err := ic.GetByID(plugins.CRIServicePlugin, "images")
        if err != nil {
                return nil, fmt.Errorf("unable to load CRI image service plugin dependency: %w", err)
        }

        if warnings, err := criconfig.ValidateServerConfig(ic.Context, config); err != nil {
                // This validates the server config, not the image config.
                return nil, fmt.Errorf("invalid cri server config: %w", err)
        } else if len(warnings) > 0 {
                ws, err := ic.GetSingle(plugins.WarningPlugin)
                if err != nil {
                        return nil, err
                }
                warn := ws.(warning.Service)
                for _, w := range warnings {
                        warn.Emit(ic.Context, w)
                }
        }

        log.G(ctx).Info("Connect containerd service")
        client, err := containerd.New(
                "",
                containerd.WithDefaultNamespace(constants.K8sContainerdNamespace),
                containerd.WithDefaultPlatform(platforms.Default()),
                containerd.WithInMemoryServices(ic),
                containerd.WithInMemorySandboxControllers(ic),
        )
        if err != nil {
                return nil, fmt.Errorf("failed to create containerd client: %w", err)
        }

        sbControllers, err := getSandboxControllers(ic)
        if err != nil {
                // Wrap with %w so the underlying error stays in the chain.
                return nil, fmt.Errorf("failed to get sandbox controllers from plugins %w", err)
        }

        streamingConfig, err := config.StreamingConfig()
        if err != nil {
                return nil, fmt.Errorf("failed to get streaming config: %w", err)
        }

        options := &server.CRIServiceOptions{
                RuntimeService:     criRuntimePlugin.(server.RuntimeService),
                ImageService:       criImagePlugin.(server.ImageService),
                StreamingConfig:    streamingConfig,
                NRI:                getNRIAPI(ic),
                Client:             client,
                SandboxControllers: sbControllers,
        }
        is := criImagePlugin.(imageService).GRPCService()

        s, rs, err := server.NewCRIService(options)
        if err != nil {
                return nil, fmt.Errorf("failed to create CRI service: %w", err)
        }

        // RegisterReadiness() must be called after NewCRIService(): https://github.com/containerd/containerd/issues/9163
        ready := ic.RegisterReadiness()
        go func() {
                if err := s.Run(ready); err != nil {
                        log.G(ctx).WithError(err).Fatal("Failed to run CRI service")
                }
                // TODO(random-liu): Whether and how we can stop containerd.
        }()

        service := &criGRPCServer{
                RuntimeServiceServer: rs,
                ImageServiceServer:   is,
                Closer:               s, // TODO: Where is close run?
                initializer:          s,
        }

        if config.DisableTCPService {
                return service, nil
        }

        return criGRPCServerWithTCP{service}, nil
}

// imageService is the view of the CRI image plugin needed here: it exposes
// the gRPC image service server that is registered alongside the runtime
// service.
type imageService interface {
        GRPCService() runtime.ImageServiceServer
}

// initializer reports whether the underlying service has finished
// initializing.
type initializer interface {
        IsInitialized() bool
}

// criGRPCServer bundles the CRI runtime and image gRPC servers with the
// service's closer and initialization state so they can be registered on a
// gRPC server as one unit.
type criGRPCServer struct {
        // CRI runtime service implementation.
        runtime.RuntimeServiceServer
        // CRI image service implementation.
        runtime.ImageServiceServer
        // Closer for the underlying CRI service.
        io.Closer
        // Initialization state of the underlying CRI service.
        initializer
}

// register wraps the server with the instrumentation layer and registers the
// wrapped value as both the runtime and the image service on s. It always
// returns nil.
func (c *criGRPCServer) register(s *grpc.Server) error {
        wrapped := instrument.NewService(c)

        runtime.RegisterRuntimeServiceServer(s, wrapped)
        runtime.RegisterImageServiceServer(s, wrapped)

        return nil
}

// Register registers all required services (runtime and image) onto a
// specific grpc server by delegating to register.
// This is used by the containerd CRI plugin.
func (c *criGRPCServer) Register(s *grpc.Server) error {
        return c.register(s)
}

// criGRPCServerWithTCP extends criGRPCServer with a RegisterTCP method so the
// same services can also be registered on a TCP gRPC server.
type criGRPCServerWithTCP struct {
        *criGRPCServer
}

// RegisterTCP registers all required services onto a GRPC server on TCP,
// using the same registration as the non-TCP server.
// This is used by the containerd CRI plugin.
func (c criGRPCServerWithTCP) RegisterTCP(s *grpc.Server) error {
        return c.register(s)
}

// getNRIAPI looks up the NRI plugin and sets up our NRI API for it.
//
// It returns nil, after logging the reason, when the plugin cannot be found
// or when the plugin instance does not implement nriservice.API. In both
// cases NRI support is disabled.
func getNRIAPI(ic *plugin.InitContext) *nri.API {
        const (
                pluginType = plugins.NRIApiPlugin
                pluginName = "nri"
        )

        ctx := ic.Context

        p, err := ic.GetByID(pluginType, pluginName)
        if err != nil {
                log.G(ctx).Info("NRI service not found, NRI support disabled")
                return nil
        }

        api, ok := p.(nriservice.API)
        if !ok {
                // Report the dynamic type of the plugin instance itself; the
                // result of the failed assertion is always nil.
                log.G(ctx).Infof("NRI plugin (%s, %q) has incorrect type %T, NRI support disabled",
                        pluginType, pluginName, p)
                return nil
        }

        log.G(ctx).Info("using experimental NRI integration - disable nri plugin to prevent this")

        return nri.NewAPI(api)
}

// getSandboxControllers collects every plugin of the sandbox controller type
// and returns them keyed by the name reported by the plugin lookup. Each
// instance is asserted to sandbox.Controller without a check, so an instance
// of another type panics.
func getSandboxControllers(ic *plugin.InitContext) (map[string]sandbox.Controller, error) {
        found, err := ic.GetByType(plugins.SandboxControllerPlugin)
        if err != nil {
                return nil, err
        }

        controllers := map[string]sandbox.Controller{}
        for id, instance := range found {
                controllers[id] = instance.(sandbox.Controller)
        }
        return controllers, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"
        "fmt"
        "path/filepath"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/core/snapshots"
        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        "github.com/containerd/containerd/v2/internal/cri/constants"
        "github.com/containerd/containerd/v2/internal/cri/server/images"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services/warning"
        "github.com/containerd/containerd/v2/version"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

// init registers the CRI image service plugin under the ID "images".
//
// The plugin's InitFn validates the image configuration, builds an in-memory
// containerd client, and resolves the snapshotter and image filesystem path
// for the default snapshotter and for every snapshotter named by a runtime
// platform. A runtime platform that names a snapshotter unknown to the
// metadata store makes initialization fail with an error naming that
// snapshotter.
func init() {
        config := criconfig.DefaultImageConfig()

        registry.Register(&plugin.Registration{
                Type:   plugins.CRIServicePlugin,
                ID:     "images",
                Config: &config,
                Requires: []plugin.Type{
                        plugins.LeasePlugin,
                        plugins.MetadataPlugin,
                        plugins.SandboxStorePlugin,
                        plugins.ServicePlugin,  // For client
                        plugins.SnapshotPlugin, // For root directory properties
                        plugins.WarningPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        m, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }
                        mdb := m.(*metadata.DB)

                        if warnings, err := criconfig.ValidateImageConfig(ic.Context, &config); err != nil {
                                return nil, fmt.Errorf("invalid cri image config: %w", err)
                        } else if len(warnings) > 0 {
                                ws, err := ic.GetSingle(plugins.WarningPlugin)
                                if err != nil {
                                        return nil, err
                                }
                                warn := ws.(warning.Service)
                                for _, w := range warnings {
                                        warn.Emit(ic.Context, w)
                                }
                        }

                        options := &images.CRIImageServiceOptions{
                                Content:          mdb.ContentStore(),
                                Images:           metadata.NewImageStore(mdb),
                                RuntimePlatforms: map[string]images.ImagePlatform{},
                                Snapshotters:     map[string]snapshots.Snapshotter{},
                                ImageFSPaths:     map[string]string{},
                        }

                        options.Client, err = containerd.New(
                                "",
                                containerd.WithDefaultNamespace(constants.K8sContainerdNamespace),
                                containerd.WithDefaultPlatform(platforms.Default()),
                                containerd.WithInMemoryServices(ic),
                        )
                        if err != nil {
                                return nil, fmt.Errorf("unable to init client for cri image service: %w", err)
                        }

                        // snapshotRootFor returns the root directory exported by the
                        // named snapshot plugin, falling back to a directory next to
                        // this plugin's root when nothing is exported.
                        snapshotRootFor := func(name string) string {
                                if p := ic.Plugins().Get(plugins.SnapshotPlugin, name); p != nil {
                                        if root := p.Meta.Exports["root"]; root != "" {
                                                return root
                                        }
                                }
                                // Try a root in the same parent as this plugin
                                return filepath.Join(filepath.Dir(ic.Properties[plugins.PropertyRootDir]), plugins.SnapshotPlugin.String()+"."+name)
                        }

                        allSnapshotters := mdb.Snapshotters()
                        defaultSnapshotter := config.Snapshotter
                        s, ok := allSnapshotters[defaultSnapshotter]
                        if !ok {
                                return nil, fmt.Errorf("failed to find snapshotter %q", defaultSnapshotter)
                        }
                        options.Snapshotters[defaultSnapshotter] = s

                        snapshotRoot := snapshotRootFor(defaultSnapshotter)
                        options.ImageFSPaths[defaultSnapshotter] = snapshotRoot
                        log.L.Infof("Get image filesystem path %q for snapshotter %q", snapshotRoot, defaultSnapshotter)

                        for runtimeName, rp := range config.RuntimePlatforms {
                                snapshotter := rp.Snapshotter
                                if snapshotter == "" {
                                        snapshotter = defaultSnapshotter
                                } else if _, ok := options.ImageFSPaths[snapshotter]; !ok {
                                        // First time this runtime-specific snapshotter is
                                        // seen: register it and its image filesystem path
                                        // under its own name, leaving the default
                                        // snapshotter's entries untouched.
                                        rs, ok := allSnapshotters[snapshotter]
                                        if !ok {
                                                return nil, fmt.Errorf("failed to find snapshotter %q", snapshotter)
                                        }
                                        options.Snapshotters[snapshotter] = rs

                                        options.ImageFSPaths[snapshotter] = snapshotRootFor(snapshotter)
                                        log.L.Infof("Get image filesystem path %q for snapshotter %q", options.ImageFSPaths[snapshotter], snapshotter)
                                }
                                platform := platforms.DefaultSpec()
                                if rp.Platform != "" {
                                        p, err := platforms.Parse(rp.Platform)
                                        if err != nil {
                                                return nil, fmt.Errorf("unable to parse platform %q: %w", rp.Platform, err)
                                        }
                                        platform = p
                                }
                                options.RuntimePlatforms[runtimeName] = images.ImagePlatform{
                                        Snapshotter: snapshotter,
                                        Platform:    platform,
                                }
                        }

                        service, err := images.NewService(config, options)
                        if err != nil {
                                return nil, fmt.Errorf("failed to create image service: %w", err)
                        }

                        return service, nil
                },
                ConfigMigration: configMigration,
        })
}

// configMigration moves image-related settings from the legacy "cri" gRPC
// plugin section into the CRI "images" plugin section. It does nothing when
// the config is already at the current version or has no legacy section. An
// existing images section is updated in place; otherwise a new one is created.
func configMigration(ctx context.Context, configVersion int, pluginConfigs map[string]interface{}) error {
        if configVersion >= version.ConfigVersion {
                return nil
        }

        legacy, found := pluginConfigs[string(plugins.GRPCPlugin)+".cri"]
        if !found {
                return nil
        }
        src := legacy.(map[string]interface{})

        imagesKey := string(plugins.CRIServicePlugin) + ".images"
        dst := map[string]interface{}{}
        if existing, found := pluginConfigs[imagesKey]; found {
                dst = existing.(map[string]interface{})
        }

        migrateConfig(dst, src)
        pluginConfigs[imagesKey] = dst
        return nil
}
// migrateConfig copies image-related settings from a legacy CRI plugin
// section (src) into the image plugin section (dst), mutating dst in place.
//
//   - src["sandbox_image"] becomes dst["pinned_images"]["sandbox"].
//   - A fixed set of top-level keys is copied as is.
//   - Each runtime under src["containerd"]["runtimes"] with a non-empty
//     "snapshot" entry gets an entry in dst["runtime_platform"].
//   - "snapshotter", "disable_snapshot_annotations" and
//     "discard_unpacked_layers" are copied from src["containerd"].
//
// The containerd-level keys are migrated whether or not a "runtimes" section
// exists, since they do not depend on it. Nested sections are asserted to
// map[string]interface{} without a check, so a section of another type
// panics.
func migrateConfig(dst, src map[string]interface{}) {
        var pinnedImages map[string]interface{}
        if v, ok := dst["pinned_images"]; ok {
                pinnedImages = v.(map[string]interface{})
        } else {
                pinnedImages = map[string]interface{}{}
        }

        if simage, ok := src["sandbox_image"]; ok {
                pinnedImages["sandbox"] = simage
        }
        if len(pinnedImages) > 0 {
                dst["pinned_images"] = pinnedImages
        }

        for _, key := range []string{
                "registry",
                "image_decryption",
                "max_concurrent_downloads",
                "image_pull_progress_timeout",
                "image_pull_with_sync_fs",
                "stats_collect_period",
        } {
                if val, ok := src[key]; ok {
                        dst[key] = val
                }
        }

        containerdConf, ok := src["containerd"]
        if !ok {
                return
        }
        containerdConfMap := containerdConf.(map[string]interface{})

        // Runtime platforms are only derived when a runtimes section exists;
        // its absence must not stop the remaining keys from being migrated.
        if runtimesConf, ok := containerdConfMap["runtimes"]; ok {
                var runtimePlatforms map[string]interface{}
                if v, ok := dst["runtime_platform"]; ok {
                        runtimePlatforms = v.(map[string]interface{})
                } else {
                        runtimePlatforms = map[string]interface{}{}
                }
                for runtime, v := range runtimesConf.(map[string]interface{}) {
                        runtimeConf := v.(map[string]interface{})
                        if snapshotter, ok := runtimeConf["snapshot"]; ok && snapshotter != "" {
                                runtimePlatforms[runtime] = map[string]interface{}{
                                        "platform":    platforms.DefaultStrict(),
                                        "snapshotter": snapshotter,
                                }
                        }
                }
                if len(runtimePlatforms) > 0 {
                        dst["runtime_platform"] = runtimePlatforms
                }
        }

        for _, key := range []string{
                "snapshotter",
                "disable_snapshot_annotations",
                "discard_unpacked_layers",
        } {
                if val, ok := containerdConfMap[key]; ok {
                        dst[key] = val
                }
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package runtime

import (
        "context"
        "encoding/json"
        "flag"
        "fmt"
        "os"
        "path/filepath"

        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        imagespec "github.com/opencontainers/image-spec/specs-go/v1"
        "k8s.io/klog/v2"

        criconfig "github.com/containerd/containerd/v2/internal/cri/config"
        "github.com/containerd/containerd/v2/internal/cri/constants"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services/warning"
        "github.com/containerd/containerd/v2/version"
        "github.com/containerd/errdefs"
        "github.com/containerd/platforms"
)

// init registers the CRI "runtime" plugin, the base plugin that other CRI
// services depend on, together with its default configuration and a migration
// for older config versions.
func init() {
        config := criconfig.DefaultRuntimeConfig()

        // migrate rewrites the legacy "cri" gRPC plugin section in place and
        // also publishes it as this plugin's section.
        migrate := func(ctx context.Context, configVersion int, pluginConfigs map[string]interface{}) error {
                if configVersion >= version.ConfigVersion {
                        return nil
                }
                legacy, found := pluginConfigs[string(plugins.GRPCPlugin)+".cri"]
                if !found {
                        return nil
                }
                conf := legacy.(map[string]interface{})
                migrateConfig(conf)
                pluginConfigs[string(plugins.CRIServicePlugin)+".runtime"] = conf
                return nil
        }

        // Base plugin that other CRI services depend on.
        registry.Register(&plugin.Registration{
                Type:            plugins.CRIServicePlugin,
                ID:              "runtime",
                Config:          &config,
                Requires:        []plugin.Type{plugins.WarningPlugin},
                ConfigMigration: migrate,
                InitFn:          initCRIRuntime,
        })
}

// initCRIRuntime is the plugin InitFn for the CRI runtime plugin.
//
// It publishes the default platform and the CRI version in the plugin
// metadata, validates the runtime configuration (emitting any warnings through
// the warning service), derives the CRI root and state directories, sets the
// klog verbosity and preloads the base OCI specs referenced by the configured
// runtimes. The returned value exposes the assembled configuration and the
// cached specs.
func initCRIRuntime(ic *plugin.InitContext) (interface{}, error) {
        ic.Meta.Platforms = []imagespec.Platform{platforms.DefaultSpec()}
        ic.Meta.Exports = map[string]string{"CRIVersion": constants.CRIVersion}

        ctx := ic.Context
        pluginConfig := ic.Config.(*criconfig.RuntimeConfig)

        warnings, err := criconfig.ValidateRuntimeConfig(ctx, pluginConfig)
        if err != nil {
                return nil, fmt.Errorf("invalid plugin config: %w", err)
        }
        if len(warnings) > 0 {
                ws, err := ic.GetSingle(plugins.WarningPlugin)
                if err != nil {
                        return nil, err
                }
                emitter := ws.(warning.Service)
                for _, w := range warnings {
                        emitter.Emit(ctx, w)
                }
        }

        // For backward compatibility, we have to keep the rootDir and stateDir the same as before.
        const legacyDirName = "io.containerd.grpc.v1.cri"
        containerdRootDir := filepath.Dir(ic.Properties[plugins.PropertyRootDir])
        containerdStateDir := filepath.Dir(ic.Properties[plugins.PropertyStateDir])

        c := criconfig.Config{
                RuntimeConfig:      *pluginConfig,
                ContainerdRootDir:  containerdRootDir,
                ContainerdEndpoint: ic.Properties[plugins.PropertyGRPCAddress],
                RootDir:            filepath.Join(containerdRootDir, legacyDirName),
                StateDir:           filepath.Join(containerdStateDir, legacyDirName),
        }

        // Ignoring errors here; this should never fail.
        encoded, _ := json.Marshal(c)
        log.G(ctx).WithFields(log.Fields{"config": string(encoded)}).Info("starting cri plugin")

        if err := setGLogLevel(); err != nil {
                return nil, fmt.Errorf("failed to set glog level: %w", err)
        }

        specs, err := loadBaseOCISpecs(&c)
        if err != nil {
                return nil, fmt.Errorf("failed to create load basic oci spec: %w", err)
        }

        return &runtime{
                config:       c,
                baseOCISpecs: specs,
        }, nil
}

// runtime contains common dependencies for CRI's runtime, image, and podsandbox services.
type runtime struct {
        // config contains all configurations; returned by Config.
        config criconfig.Config
        // baseOCISpecs contains cached OCI specs loaded via `Runtime.BaseRuntimeSpec`,
        // keyed by the spec file name; consulted by LoadOCISpec.
        baseOCISpecs map[string]*oci.Spec
}

// Config returns the CRI configuration assembled at plugin initialization.
// The struct is returned by value.
func (r *runtime) Config() criconfig.Config {
        return r.config
}

// LoadOCISpec returns the cached base OCI spec registered under filename.
// Nothing is read from disk here: a filename that is absent from the cache
// results in errdefs.ErrNotFound.
func (r *runtime) LoadOCISpec(filename string) (*oci.Spec, error) {
        if cached, found := r.baseOCISpecs[filename]; found {
                return cached, nil
        }
        // TODO: Load here or only allow preloading...
        return nil, errdefs.ErrNotFound
}

// loadBaseOCISpecs loads the base OCI spec file referenced by each configured
// runtime and returns the parsed specs keyed by file path. Runtimes without a
// BaseRuntimeSpec are skipped, and a path shared by several runtimes is only
// loaded once. The first file that fails to load aborts the whole operation.
func loadBaseOCISpecs(config *criconfig.Config) (map[string]*oci.Spec, error) {
        loaded := make(map[string]*oci.Spec)
        for _, rt := range config.Runtimes {
                path := rt.BaseRuntimeSpec
                // Nothing to do when no spec is configured or the same file
                // has already been loaded for another runtime.
                if _, seen := loaded[path]; path == "" || seen {
                        continue
                }

                parsed, err := loadOCISpec(path)
                if err != nil {
                        return nil, fmt.Errorf("failed to load base OCI spec from file: %s: %w", path, err)
                }
                loaded[path] = parsed
        }

        return loaded, nil
}

// loadOCISpec opens filename and decodes its JSON content into an OCI spec.
// It returns a wrapped error when the file cannot be opened or parsed.
func loadOCISpec(filename string) (*oci.Spec, error) {
        f, openErr := os.Open(filename)
        if openErr != nil {
                return nil, fmt.Errorf("failed to open base OCI spec: %s: %w", filename, openErr)
        }
        defer f.Close()

        parsed := new(oci.Spec)
        if decodeErr := json.NewDecoder(f).Decode(parsed); decodeErr != nil {
                return nil, fmt.Errorf("failed to parse base OCI spec file: %w", decodeErr)
        }

        return parsed, nil
}

// setGLogLevel registers the klog flags on a private flag set, directs klog
// output to stderr and picks a klog verbosity matching the current log level:
// trace -> 5, debug -> 4, info -> 2. Any other level leaves "v" untouched.
func setGLogLevel() error {
        level := log.GetLevel()

        klogFlags := flag.NewFlagSet("klog", flag.PanicOnError)
        klog.InitFlags(klogFlags)
        if err := klogFlags.Set("logtostderr", "true"); err != nil {
                return err
        }

        var verbosity string
        switch level {
        case log.TraceLevel:
                verbosity = "5"
        case log.DebugLevel:
                verbosity = "4"
        case log.InfoLevel:
                verbosity = "2"
        default:
                // glog doesn't support other filters. Defaults to v=0.
                return nil
        }
        return klogFlags.Set("v", verbosity)
}

// migrateConfig rewrites a decoded plugin configuration in place: for every
// entry under containerd.runtimes that sets "sandbox_mode" but not
// "sandboxer", the "sandbox_mode" value is copied to "sandboxer". An existing
// "sandboxer" value is never overwritten.
//
// The configuration comes from decoded user input, so its shape is not
// trusted: a missing key, or a "containerd", "runtimes" or runtime entry that
// is not a table, is skipped instead of triggering a type-assertion panic.
func migrateConfig(conf map[string]interface{}) {
        containerdConf, ok := conf["containerd"].(map[string]interface{})
        if !ok {
                return
        }
        runtimesConf, ok := containerdConf["runtimes"].(map[string]interface{})
        if !ok {
                return
        }
        for _, v := range runtimesConf {
                runtimeConf, ok := v.(map[string]interface{})
                if !ok {
                        // Malformed runtime entry; leave it alone.
                        continue
                }
                sandboxMode, ok := runtimeConf["sandbox_mode"]
                if !ok {
                        continue
                }
                if _, ok := runtimeConf["sandboxer"]; !ok {
                        runtimeConf["sandboxer"] = sandboxMode
                }
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package walking

import (
        "context"
        "crypto/rand"
        "encoding/base64"
        "errors"
        "fmt"
        "io"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/pkg/archive"
        "github.com/containerd/containerd/v2/pkg/archive/compression"
        "github.com/containerd/containerd/v2/pkg/epoch"
        "github.com/containerd/containerd/v2/pkg/labels"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// walkingDiff computes diffs between mount sets and writes them into store.
// Instances are created with NewWalkingDiff.
type walkingDiff struct {
        // store is the content store that receives the generated diff.
        store content.Store
}

// emptyDesc is the zero descriptor returned by Compare alongside an error.
var emptyDesc = ocispec.Descriptor{}

// NewWalkingDiff returns a generic diff.Comparer backed by the given content
// store. A diff is produced by mounting the lower and upper mount sets and
// walking both mounted trees, detecting changes by comparing files against
// each other or by comparing which files exist on each side. No special
// property of the mount sets is relied upon, so the comparer is expected to
// work with any filesystem.
func NewWalkingDiff(store content.Store) diff.Comparer {
        return &walkingDiff{store: store}
}

// Compare creates a diff between the given mounts and uploads the result
// to the content store.
//
// Options are applied to a diff.Config first. When the config carries no
// SourceDateEpoch, the epoch found in ctx (if any) is used. With a custom
// Compressor the media type must be set explicitly; without one the media type
// defaults to ocispec.MediaTypeImageLayerGzip and only the plain and gzip
// image layer media types are accepted (anything else yields an error wrapping
// errdefs.ErrNotImplemented).
//
// For compressed output the digest of the uncompressed stream is recorded in
// the labels.LabelUncompressed label. When no Reference is configured a unique
// one is generated, and a failed upload under such a generated reference is
// aborted in the store. On success the returned descriptor carries the media
// type plus the size and digest reported by the content store; on failure
// emptyDesc and the error are returned.
func (s *walkingDiff) Compare(ctx context.Context, lower, upper []mount.Mount, opts ...diff.Opt) (d ocispec.Descriptor, err error) {
        var config diff.Config
        for _, opt := range opts {
                if err := opt(&config); err != nil {
                        return emptyDesc, err
                }
        }
        if tm := epoch.FromContext(ctx); tm != nil && config.SourceDateEpoch == nil {
                config.SourceDateEpoch = tm
        }

        var writeDiffOpts []archive.WriteDiffOpt
        if config.SourceDateEpoch != nil {
                writeDiffOpts = append(writeDiffOpts, archive.WithSourceDateEpoch(config.SourceDateEpoch))
        }

        var isCompressed bool
        if config.Compressor != nil {
                if config.MediaType == "" {
                        return emptyDesc, errors.New("media type must be explicitly specified when using custom compressor")
                }
                isCompressed = true
        } else {
                if config.MediaType == "" {
                        config.MediaType = ocispec.MediaTypeImageLayerGzip
                }

                switch config.MediaType {
                case ocispec.MediaTypeImageLayer:
                case ocispec.MediaTypeImageLayerGzip:
                        isCompressed = true
                default:
                        return emptyDesc, fmt.Errorf("unsupported diff media type: %v: %w", config.MediaType, errdefs.ErrNotImplemented)
                }
        }

        var ocidesc ocispec.Descriptor
        if err := mount.WithTempMount(ctx, lower, func(lowerRoot string) error {
                return mount.WithReadonlyTempMount(ctx, upper, func(upperRoot string) error {
                        var newReference bool
                        if config.Reference == "" {
                                newReference = true
                                config.Reference = uniqueRef()
                        }

                        cw, err := s.store.Writer(ctx,
                                content.WithRef(config.Reference),
                                content.WithDescriptor(ocispec.Descriptor{
                                        MediaType: config.MediaType, // most contentstore implementations just ignore this
                                }))
                        if err != nil {
                                return fmt.Errorf("failed to open writer: %w", err)
                        }

                        // errOpen is set when an error occurs while the content writer has not been
                        // committed or closed yet to force a cleanup
                        var errOpen error
                        defer func() {
                                if errOpen != nil {
                                        cw.Close()
                                        if newReference {
                                                if abortErr := s.store.Abort(ctx, config.Reference); abortErr != nil {
                                                        log.G(ctx).WithError(abortErr).WithField("ref", config.Reference).Warnf("failed to delete diff upload")
                                                }
                                        }
                                }
                        }()
                        if !newReference {
                                // A caller-supplied reference may hold data from an
                                // earlier attempt; start from an empty upload.
                                if errOpen = cw.Truncate(0); errOpen != nil {
                                        return errOpen
                                }
                        }

                        if isCompressed {
                                // The digester sees the uncompressed stream while the
                                // compressor feeds the content writer.
                                dgstr := digest.SHA256.Digester()
                                var compressed io.WriteCloser
                                if config.Compressor != nil {
                                        compressed, errOpen = config.Compressor(cw, config.MediaType)
                                        if errOpen != nil {
                                                return fmt.Errorf("failed to get compressed stream: %w", errOpen)
                                        }
                                } else {
                                        compressed, errOpen = compression.CompressStream(cw, compression.Gzip)
                                        if errOpen != nil {
                                                return fmt.Errorf("failed to get compressed stream: %w", errOpen)
                                        }
                                }
                                errOpen = archive.WriteDiff(ctx, io.MultiWriter(compressed, dgstr.Hash()), lowerRoot, upperRoot, writeDiffOpts...)
                                // Always close the compressor. A compressor may still
                                // have buffered output to write on Close, so when the
                                // diff itself was written successfully a Close failure
                                // must not be dropped: committing would store an
                                // incomplete blob. An earlier write error takes priority.
                                if closeErr := compressed.Close(); errOpen == nil {
                                        errOpen = closeErr
                                }
                                if errOpen != nil {
                                        return fmt.Errorf("failed to write compressed diff: %w", errOpen)
                                }

                                if config.Labels == nil {
                                        config.Labels = map[string]string{}
                                }
                                config.Labels[labels.LabelUncompressed] = dgstr.Digest().String()
                        } else {
                                if errOpen = archive.WriteDiff(ctx, cw, lowerRoot, upperRoot, writeDiffOpts...); errOpen != nil {
                                        return fmt.Errorf("failed to write diff: %w", errOpen)
                                }
                        }

                        var commitopts []content.Opt
                        if config.Labels != nil {
                                commitopts = append(commitopts, content.WithLabels(config.Labels))
                        }

                        dgst := cw.Digest()
                        if errOpen = cw.Commit(ctx, 0, dgst, commitopts...); errOpen != nil {
                                if !errdefs.IsAlreadyExists(errOpen) {
                                        return fmt.Errorf("failed to commit: %w", errOpen)
                                }
                                // Content with this digest is already stored; treat it as success.
                                errOpen = nil
                        }

                        info, err := s.store.Info(ctx, dgst)
                        if err != nil {
                                return fmt.Errorf("failed to get info from content store: %w", err)
                        }
                        if info.Labels == nil {
                                info.Labels = make(map[string]string)
                        }
                        // Set "containerd.io/uncompressed" label if digest already existed without label
                        if _, ok := info.Labels[labels.LabelUncompressed]; !ok {
                                info.Labels[labels.LabelUncompressed] = config.Labels[labels.LabelUncompressed]
                                if _, err := s.store.Update(ctx, info, "labels."+labels.LabelUncompressed); err != nil {
                                        return fmt.Errorf("error setting uncompressed label: %w", err)
                                }
                        }

                        ocidesc = ocispec.Descriptor{
                                MediaType: config.MediaType,
                                Size:      info.Size,
                                Digest:    info.Digest,
                        }
                        return nil
                })
        }); err != nil {
                return emptyDesc, err
        }

        return ocidesc, nil
}

// uniqueRef builds a reference of the form "<unix-nanoseconds>-<suffix>",
// where the suffix is the URL-safe base64 encoding of three random bytes.
func uniqueRef() string {
        now := time.Now()
        var entropy [3]byte
        // Ignore read failures, just decreases uniqueness
        rand.Read(entropy[:])
        suffix := base64.URLEncoding.EncodeToString(entropy[:])
        return fmt.Sprint(now.UnixNano()) + "-" + suffix
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package plugin

import (
        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/diff/apply"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/diff/walking"
        "github.com/containerd/platforms"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.DiffPlugin,
                ID:   "walking",
                Requires: []plugin.Type{
                        plugins.MetadataPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        md, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }

                        ic.Meta.Platforms = append(ic.Meta.Platforms, platforms.DefaultSpec())
                        cs := md.(*metadata.DB).ContentStore()

                        return diffPlugin{
                                Comparer: walking.NewWalkingDiff(cs),
                                Applier:  apply.NewFileSystemApplier(cs),
                        }, nil
                },
        })
}

// diffPlugin is the instance returned by the "walking" diff plugin. It exposes
// both diff capabilities by embedding a Comparer and an Applier.
type diffPlugin struct {
        diff.Comparer
        diff.Applier
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package events

import (
        "github.com/containerd/containerd/v2/core/events/exchange"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.EventPlugin,
                ID:   "exchange",
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        return exchange.NewExchange(), nil
                },
        })
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package scheduler

import "github.com/docker/go-metrics"

// Metrics of the gc scheduler. Both are created and registered in init.
var (
        // collectionCounter metrics for counter of gc scheduler collections.
        // It carries a single "status" label.
        collectionCounter metrics.LabeledCounter

        // gcTimeHist histogram metrics for duration of gc scheduler collections.
        gcTimeHist metrics.Timer
)

// init creates the gc scheduler metrics in the "containerd"/"gc" namespace
// and registers that namespace.
func init() {
        gcNamespace := metrics.NewNamespace("containerd", "gc", nil)

        // Collections are counted per "status" label value.
        collectionCounter = gcNamespace.NewLabeledCounter("collections", "counter of gc scheduler collections", "status")
        gcTimeHist = gcNamespace.NewTimer("gc", "duration of gc scheduler collections")

        metrics.Register(gcNamespace)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package scheduler

import (
        "context"
        "errors"
        "fmt"
        "sync"
        "time"

        "github.com/containerd/containerd/v2/internal/tomlext"
        "github.com/containerd/containerd/v2/pkg/gc"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

// config configures the garbage collection policies.
type config struct {
        // PauseThreshold represents the maximum amount of time garbage
        // collection should be scheduled based on the average pause time.
        // For example, a value of 0.02 means that scheduled garbage collection
        // pauses should represent at most 2% of real time,
        // or 20ms of every second.
        //
        // A maximum value of .5 is enforced to prevent over scheduling of the
        // garbage collector, trigger options are available to run in a more
        // predictable time frame after mutation.
        //
        // Default is 0.02
        PauseThreshold float64 `toml:"pause_threshold"`

        // DeletionThreshold is used to guarantee that a garbage collection is
        // scheduled after configured number of deletions have occurred
        // since the previous garbage collection. A value of 0 indicates that
        // garbage collection will not be triggered by deletion count.
        //
        // Default 0
        DeletionThreshold int `toml:"deletion_threshold"`

        // MutationThreshold is used to guarantee that a garbage collection is
        // run after a configured number of database mutations have occurred
        // since the previous garbage collection. A value of 0 indicates that
        // garbage collection will only be run after a manual trigger or
        // deletion. Unlike the deletion threshold, the mutation threshold does
        // not cause scheduling of a garbage collection, but ensures GC is run
        // at the next scheduled GC.
        //
        // Default 100
        MutationThreshold int `toml:"mutation_threshold"`

        // ScheduleDelay is the duration in the future to schedule a garbage
        // collection triggered manually or by exceeding the configured
        // threshold for deletion or mutation. A zero value will immediately
        // schedule. Use suffix "ms" for millisecond and "s" for second.
        //
        // Default is "0ms"
        ScheduleDelay tomlext.Duration `toml:"schedule_delay"`

        // StartupDelay is the delay duration to do an initial garbage
        // collection after startup. The initial garbage collection is used to
        // set the base for pause threshold and should be scheduled in the
        // future to avoid slowing down other startup processes. Use suffix
        // "ms" for millisecond and "s" for second.
        //
        // Default is "100ms"
        StartupDelay tomlext.Duration `toml:"startup_delay"`
}

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GCPlugin,
                ID:   "scheduler",
                Requires: []plugin.Type{
                        plugins.MetadataPlugin,
                },
                Config: &config{
                        PauseThreshold:    0.02,
                        DeletionThreshold: 0,
                        MutationThreshold: 100,
                        ScheduleDelay:     tomlext.FromStdTime(0),
                        StartupDelay:      tomlext.FromStdTime(100 * time.Millisecond),
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        md, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }

                        mdCollector, ok := md.(collector)
                        if !ok {
                                return nil, fmt.Errorf("%s %T must implement collector", plugins.MetadataPlugin, md)
                        }

                        m := newScheduler(mdCollector, ic.Config.(*config))

                        ic.Meta.Exports = map[string]string{
                                "PauseThreshold":    fmt.Sprint(m.pauseThreshold),
                                "DeletionThreshold": fmt.Sprint(m.deletionThreshold),
                                "MutationThreshold": fmt.Sprint(m.mutationThreshold),
                                "ScheduleDelay":     fmt.Sprint(m.scheduleDelay),
                        }

                        go m.run(ic.Context)

                        return m, nil
                },
        })
}

// mutationEvent is the message sent to the scheduler loop over eventC. It is
// produced either by the collector's mutation callback or by a manual trigger
// from wait.
type mutationEvent struct {
        // ts is the time the event was created; the loop ignores events that
        // are older than the last collection.
        ts time.Time
        // mutation is true for events coming from the mutation callback. An
        // event with mutation == false is treated as a manual trigger.
        mutation bool
        // dirty is the flag passed to the mutation callback; the loop counts
        // dirty events as deletions.
        dirty bool
}

// collector is what the scheduler needs from the component being garbage
// collected.
type collector interface {
        // RegisterMutationCallback installs a callback; the scheduler registers
        // its mutationCallback, which receives the value as its dirty flag.
        RegisterMutationCallback(func(bool))
        // GarbageCollect runs one collection and returns its statistics.
        GarbageCollect(context.Context) (gc.Stats, error)
}

// gcScheduler decides when to run garbage collection on a collector, based on
// mutation events, manual triggers and the configured thresholds and delays.
type gcScheduler struct {
        // c performs the actual garbage collection.
        c collector

        // eventC carries mutation and trigger events to the run loop.
        eventC chan mutationEvent

        // waiterL guards waiters; the run loop also holds it while a
        // collection is in progress.
        waiterL sync.Mutex
        // waiters are the channels of callers blocked in wait. Each receives
        // the stats of the next collection, or is closed when it fails.
        waiters []chan gc.Stats

        // The fields below are the sanitized configuration values set by
        // newScheduler.
        pauseThreshold    float64
        deletionThreshold int
        mutationThreshold int
        scheduleDelay     time.Duration
        startupDelay      time.Duration
}

// newScheduler builds a gcScheduler for c from cfg and registers the
// scheduler's mutation callback on c. Configuration values are sanitized
// first: the pause threshold is clamped to [0.0, 0.5], and a negative mutation
// threshold, schedule delay or startup delay is replaced by zero. The
// scheduler loop is not started here.
func newScheduler(c collector, cfg *config) *gcScheduler {
        pause := cfg.PauseThreshold
        switch {
        case pause < 0.0:
                pause = 0.0
        case pause > 0.5:
                pause = 0.5
        }

        mutationLimit := cfg.MutationThreshold
        if mutationLimit < 0 {
                mutationLimit = 0
        }

        schedDelay := time.Duration(cfg.ScheduleDelay)
        if schedDelay < 0 {
                schedDelay = 0
        }

        startDelay := time.Duration(cfg.StartupDelay)
        if startDelay < 0 {
                startDelay = 0
        }

        sched := &gcScheduler{
                c:                 c,
                eventC:            make(chan mutationEvent),
                pauseThreshold:    pause,
                deletionThreshold: cfg.DeletionThreshold,
                mutationThreshold: mutationLimit,
                scheduleDelay:     schedDelay,
                startupDelay:      startDelay,
        }

        c.RegisterMutationCallback(sched.mutationCallback)

        return sched
}

// ScheduleAndWait triggers a garbage collection and blocks until the next
// collection finishes or ctx is done. It returns the collection's stats, or an
// error when the collection failed or ctx ended first.
func (s *gcScheduler) ScheduleAndWait(ctx context.Context) (gc.Stats, error) {
        return s.wait(ctx, true)
}

// wait registers the caller as a waiter and blocks until the next garbage
// collection reports a result or ctx is done. When trigger is true a manual
// trigger event is sent to the scheduler loop from a separate goroutine. A
// closed waiter channel means the collection failed and is reported as the
// "gc failed" error; a finished ctx is reported as ctx.Err().
func (s *gcScheduler) wait(ctx context.Context, trigger bool) (gc.Stats, error) {
        // Buffered so the loop can deliver the result without blocking.
        result := make(chan gc.Stats, 1)

        s.waiterL.Lock()
        s.waiters = append(s.waiters, result)
        s.waiterL.Unlock()

        if trigger {
                go func(ev mutationEvent) {
                        s.eventC <- ev
                }(mutationEvent{ts: time.Now()})
        }

        var none gc.Stats
        select {
        case <-ctx.Done():
                return none, ctx.Err()
        case stats, open := <-result:
                if !open {
                        return none, errors.New("gc failed")
                }
                return stats, nil
        }
}

// mutationCallback is the callback registered with the collector. It stamps a
// mutation event with the current time and hands it to the scheduler loop
// from a separate goroutine, so the caller is never blocked on eventC.
func (s *gcScheduler) mutationCallback(dirty bool) {
        go func(ev mutationEvent) {
                s.eventC <- ev
        }(mutationEvent{
                ts:       time.Now(),
                mutation: true,
                dirty:    dirty,
        })
}

// schedule starts a timer for d and returns its channel together with a
// pointer to the time at which it is expected to fire (now + d).
func schedule(d time.Duration) (<-chan time.Time, *time.Time) {
        deadline := new(time.Time)
        *deadline = time.Now().Add(d)
        return time.After(d), deadline
}

// run is the scheduler loop; it returns when ctx is done.
//
// The loop waits on three things: the schedule timer, events on eventC and
// ctx. Events only update the counters and, when a threshold or manual trigger
// demands it, pull the next collection forward by scheduleDelay. When the
// timer fires a collection is run unless nothing was requested since the last
// one, in which case the timer is simply re-armed for another interval.
//
// After a successful collection the counters are reset, the interval is
// recomputed from the average collection time and the pause threshold, and all
// waiters receive the stats. After a failed collection the waiter channels are
// closed and a retry is scheduled.
func (s *gcScheduler) run(ctx context.Context) {
        const minimumGCTime = float64(5 * time.Millisecond)
        var (
                schedC <-chan time.Time

                lastCollection *time.Time
                nextCollection *time.Time

                interval    = time.Second
                gcTimeSum   time.Duration
                collections int

                triggered bool
                deletions int
                mutations int
        )
        if s.startupDelay > 0 {
                schedC, nextCollection = schedule(s.startupDelay)
        }
        for {
                select {
                case <-schedC:
                        // Check if garbage collection can be skipped because
                        // it is not needed or was not requested and reschedule
                        // it to attempt again after another time interval.
                        if !triggered && lastCollection != nil && deletions == 0 &&
                                (s.mutationThreshold == 0 || mutations < s.mutationThreshold) {
                                schedC, nextCollection = schedule(interval)
                                continue
                        }
                case e := <-s.eventC:
                        // Events created before the last collection are already
                        // covered by it.
                        if lastCollection != nil && lastCollection.After(e.ts) {
                                continue
                        }
                        if e.dirty {
                                deletions++
                        }
                        if e.mutation {
                                mutations++
                        } else {
                                triggered = true
                        }

                        // Check if condition should cause immediate collection.
                        if triggered ||
                                (s.deletionThreshold > 0 && deletions >= s.deletionThreshold) ||
                                (nextCollection == nil && ((s.deletionThreshold == 0 && deletions > 0) ||
                                        (s.mutationThreshold > 0 && mutations >= s.mutationThreshold))) {
                                // Check if not already scheduled before delay threshold
                                if nextCollection == nil || nextCollection.After(time.Now().Add(s.scheduleDelay)) {
                                        // TODO(dmcg): track re-schedules for tuning schedule config
                                        schedC, nextCollection = schedule(s.scheduleDelay)
                                }
                        }

                        continue
                case <-ctx.Done():
                        return
                }

                // Hold the waiter lock for the whole collection so that waiters
                // registering meanwhile are served by the next collection.
                s.waiterL.Lock()

                stats, err := s.c.GarbageCollect(ctx)
                last := time.Now()
                if err != nil {
                        log.G(ctx).WithError(err).Error("garbage collection failed")
                        collectionCounter.WithValues("fail").Inc()

                        // Reschedule garbage collection for same duration + 1 second.
                        // If no collection attempt has completed yet there is no
                        // previous collection time to measure from, so fall back to
                        // the current interval instead of dereferencing a nil
                        // lastCollection.
                        retryDelay := interval
                        if lastCollection != nil && nextCollection != nil {
                                retryDelay = nextCollection.Sub(*lastCollection)
                        }
                        schedC, nextCollection = schedule(retryDelay + time.Second)

                        // Update last collection time even though failure occurred
                        lastCollection = &last

                        // A closed channel tells the waiter that the collection failed.
                        for _, w := range s.waiters {
                                close(w)
                        }
                        s.waiters = nil
                        s.waiterL.Unlock()
                        continue
                }

                gcTime := stats.Elapsed()
                gcTimeHist.Update(gcTime)
                log.G(ctx).WithField("d", gcTime).Trace("garbage collected")
                gcTimeSum += gcTime
                collections++
                collectionCounter.WithValues("success").Inc()
                triggered = false
                deletions = 0
                mutations = 0

                // Calculate new interval with updated times
                if s.pauseThreshold > 0.0 {
                        // Set interval to average gc time divided by the pause threshold
                        // This algorithm ensures that a gc is scheduled to allow enough
                        // runtime in between gc to reach the pause threshold.
                        // Pause threshold is always 0.0 < threshold <= 0.5
                        avg := float64(gcTimeSum) / float64(collections)
                        // Enforce that avg is no less than minimumGCTime
                        // to prevent immediate rescheduling
                        if avg < minimumGCTime {
                                avg = minimumGCTime
                        }
                        interval = time.Duration(avg/s.pauseThreshold - avg)
                }

                lastCollection = &last
                schedC, nextCollection = schedule(interval)

                for _, w := range s.waiters {
                        w <- stats
                }
                s.waiters = nil
                s.waiterL.Unlock()
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package imageverifier

import (
        "time"

        "github.com/containerd/containerd/v2/internal/tomlext"
        "github.com/containerd/containerd/v2/pkg/imageverifier/bindir"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

// Register default image verifier service plugin
func init() {
        registry.Register(&plugin.Registration{
                Type:   plugins.ImageVerifierPlugin,
                ID:     "bindir",
                Config: defaultConfig(),
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        cfg := ic.Config.(*bindir.Config)
                        return bindir.NewImageVerifier(cfg), nil
                },
        })
}

// defaultConfig returns the default bindir verifier configuration: binaries
// under defaultPath, at most 10 verifiers and a 10 second timeout per verifier.
func defaultConfig() *bindir.Config {
        cfg := new(bindir.Config)
        cfg.BinDir = defaultPath
        cfg.MaxVerifiers = 10
        cfg.PerVerifierTimeout = tomlext.FromStdTime(10 * time.Second)
        return cfg
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package plugin

import (
        "context"

        "github.com/containerd/containerd/v2/core/leases"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/pkg/gc"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.LeasePlugin,
                ID:   "manager",
                Requires: []plugin.Type{
                        plugins.MetadataPlugin,
                        plugins.GCPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        m, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }
                        g, err := ic.GetSingle(plugins.GCPlugin)
                        if err != nil {
                                return nil, err
                        }
                        return &local{
                                Manager: metadata.NewLeaseManager(m.(*metadata.DB)),
                                gc:      g.(gcScheduler),
                        }, nil
                },
        })
}

// gcScheduler is the behavior the lease manager needs from the GC plugin
// instance: a ScheduleAndWait call that reports gc.Stats or an error.
type gcScheduler interface {
        ScheduleAndWait(context.Context) (gc.Stats, error)
}

// local embeds a leases.Manager and overrides Delete so that a deletion can
// additionally run garbage collection through gc.
type local struct {
        leases.Manager
        gc gcScheduler
}

// Delete removes the lease through the embedded manager. When the delete
// options request a synchronous deletion, it then calls ScheduleAndWait on
// the GC scheduler and returns its error, if any.
func (l *local) Delete(ctx context.Context, lease leases.Lease, opts ...leases.DeleteOpt) error {
        var options leases.DeleteOptions
        for i := range opts {
                if err := opts[i](ctx, &options); err != nil {
                        return err
                }
        }

        if err := l.Manager.Delete(ctx, lease); err != nil {
                return err
        }
        if !options.Synchronous {
                return nil
        }

        _, err := l.gc.ScheduleAndWait(ctx)
        return err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package plugin

import (
        "fmt"
        "os"
        "path/filepath"
        "time"

        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/pkg/timeout"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"

        bolt "go.etcd.io/bbolt"
)

const (
        // boltOpenTimeout is the key in the timeout package under which the
        // timeout applied to bolt.Open is stored.
        boltOpenTimeout = "io.containerd.timeout.bolt.open"
)

// init installs the default value for the bolt open timeout.
func init() {
        // set to 0 means to wait indefinitely for bolt.Open
        const waitIndefinitely = 0
        timeout.Set(boltOpenTimeout, waitIndefinitely)
}

// BoltConfig defines the configuration values for the bolt plugin. It is
// declared and loaded here, rather than being registered back in the
// metadata package.
type BoltConfig struct {
        // ContentSharingPolicy sets the sharing policy for content between
        // namespaces.
        //
        // The default mode "shared" will make blobs available in all
        // namespaces once it is pulled into any namespace. The blob will be pulled
        // into the namespace if a writer is opened with the "Expected" digest that
        // is already present in the backend.
        //
        // The alternative mode, "isolated" requires that clients prove they have
        // access to the content by providing all of the content to the ingest
        // before the blob is added to the namespace.
        //
        // Both modes share backing data, while "shared" will reduce total
        // bandwidth across namespaces, at the cost of allowing access to any blob
        // just by knowing its digest.
        //
        // Accepted values are SharingPolicyShared and SharingPolicyIsolated;
        // see Validate.
        ContentSharingPolicy string `toml:"content_sharing_policy"`
}

// Accepted values for BoltConfig.ContentSharingPolicy.
const (
        // SharingPolicyShared represents the "shared" sharing policy
        SharingPolicyShared = "shared"
        // SharingPolicyIsolated represents the "isolated" sharing policy
        SharingPolicyIsolated = "isolated"
)

// Validate checks that ContentSharingPolicy names a known policy. It returns
// nil for "shared" and "isolated"; any other value, including the empty
// string, yields an error wrapping errdefs.ErrInvalidArgument.
func (bc *BoltConfig) Validate() error {
        policy := bc.ContentSharingPolicy
        if policy == SharingPolicyShared || policy == SharingPolicyIsolated {
                return nil
        }
        return fmt.Errorf("unknown policy: %s: %w", policy, errdefs.ErrInvalidArgument)
}

// init registers the "bolt" metadata plugin. Its InitFn opens the meta.db
// bolt database under the plugin root directory and wraps it, together with
// the content store, the snapshotters and the event publisher, in a
// metadata.DB.
func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.MetadataPlugin,
                ID:   "bolt",
                Requires: []plugin.Type{
                        plugins.ContentPlugin,
                        plugins.EventPlugin,
                        plugins.SnapshotPlugin,
                },
                Config: &BoltConfig{
                        ContentSharingPolicy: SharingPolicyShared,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        root := ic.Properties[plugins.PropertyRootDir]
                        if err := os.MkdirAll(root, 0711); err != nil {
                                return nil, err
                        }
                        cs, err := ic.GetSingle(plugins.ContentPlugin)
                        if err != nil {
                                return nil, err
                        }

                        snapshottersRaw, err := ic.GetByType(plugins.SnapshotPlugin)
                        if err != nil {
                                return nil, err
                        }

                        // Index every loaded snapshotter plugin by its name.
                        snapshotters := make(map[string]snapshots.Snapshotter)
                        for name, sn := range snapshottersRaw {
                                snapshotters[name] = sn.(snapshots.Snapshotter)
                        }

                        ep, err := ic.GetSingle(plugins.EventPlugin)
                        if err != nil {
                                return nil, err
                        }

                        // The policy defaults to "shared"; it is only switched when the
                        // configuration explicitly (and validly) asks for "isolated".
                        shared := true
                        ic.Meta.Exports["policy"] = SharingPolicyShared
                        if cfg, ok := ic.Config.(*BoltConfig); ok {
                                if cfg.ContentSharingPolicy != "" {
                                        if err := cfg.Validate(); err != nil {
                                                return nil, err
                                        }
                                        if cfg.ContentSharingPolicy == SharingPolicyIsolated {
                                                ic.Meta.Exports["policy"] = SharingPolicyIsolated
                                                shared = false
                                        }

                                        log.G(ic.Context).WithField("policy", cfg.ContentSharingPolicy).Info("metadata content store policy set")
                                }
                        }

                        path := filepath.Join(root, "meta.db")
                        ic.Meta.Exports["path"] = path

                        options := *bolt.DefaultOptions
                        // Reading bbolt's freelist sometimes fails when the file has a data corruption.
                        // Disabling freelist sync reduces the chance of the breakage.
                        // https://github.com/etcd-io/bbolt/pull/1
                        // https://github.com/etcd-io/bbolt/pull/6
                        options.NoFreelistSync = true
                        // Without the timeout, bbolt.Open would block indefinitely due to flock(2).
                        options.Timeout = timeout.Get(boltOpenTimeout)

                        // Warn once if opening the database takes longer than 10 seconds;
                        // the watcher goroutine exits as soon as doneCh is closed.
                        doneCh := make(chan struct{})
                        go func() {
                                t := time.NewTimer(10 * time.Second)
                                defer t.Stop()
                                select {
                                case <-t.C:
                                        log.G(ic.Context).WithField("plugin", "bolt").Warn("waiting for response from boltdb open")
                                case <-doneCh:
                                        return
                                }
                        }()
                        db, err := bolt.Open(path, 0644, &options)
                        close(doneCh)
                        if err != nil {
                                return nil, err
                        }

                        dbopts := []metadata.DBOpt{
                                metadata.WithEventsPublisher(ep.(events.Publisher)),
                        }

                        if !shared {
                                dbopts = append(dbopts, metadata.WithPolicyIsolated)
                        }

                        mdb := metadata.NewDB(db, cs.(content.Store), snapshotters, dbopts...)
                        if err := mdb.Init(ic.Context); err != nil {
                                // The handle is not returned to anyone on this path, so close
                                // it here instead of leaking the open database file.
                                if cerr := db.Close(); cerr != nil {
                                        log.G(ic.Context).WithError(cerr).Warn("failed to close boltdb after metadata init failure")
                                }
                                return nil, err
                        }
                        return mdb, nil
                },
        })
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package nri

import (
        "github.com/containerd/containerd/v2/internal/nri"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

func init() {
        registry.Register(&plugin.Registration{
                Type:   plugins.NRIApiPlugin,
                ID:     "nri",
                Config: nri.DefaultConfig(),
                InitFn: initFunc,
        })
}

// initFunc builds the NRI instance from the plugin's loaded configuration and
// returns whatever nri.New produces, including its error.
func initFunc(ic *plugin.InitContext) (interface{}, error) {
        cfg := ic.Config.(*nri.Config)
        return nri.New(cfg)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package monitor

import (
        "context"
        "fmt"
        "net/url"
        "strconv"
        "syscall"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/runtime/restart"
        "github.com/containerd/containerd/v2/pkg/cio"
)

// stopChange is a change that kills and deletes the task of container.
type stopChange struct {
        container containerd.Container
}

// apply stops the container by handing it to killTask. The client argument
// is not used.
func (s *stopChange) apply(ctx context.Context, client *containerd.Client) error {
        target := s.container
        return killTask(ctx, target)
}

// startChange is a change that (re)starts the task of container.
//
// logURI, when non-empty, is parsed as a URL and used for the task's IO;
// count, when positive, is written to the container's restart count label
// before the task is started.
type startChange struct {
        container containerd.Container
        logURI    string
        count     int
}

// apply starts a fresh task for the container.
//
// It picks the task IO (null IO, or a log URI creator that honors the spec's
// terminal setting), records the restart count label when count is positive,
// removes any existing task via killTask and finally creates and starts the
// new task. The client argument is not used.
func (s *startChange) apply(ctx context.Context, client *containerd.Client) error {
        spec, err := s.container.Spec(ctx)
        if err != nil {
                return err
        }
        useTTY := spec.Process.Terminal

        creator := cio.NullIO
        if s.logURI != "" {
                uri, err := url.Parse(s.logURI)
                if err != nil {
                        return fmt.Errorf("failed to parse %v into url: %w", s.logURI, err)
                }
                if !useTTY {
                        creator = cio.LogURI(uri)
                } else {
                        creator = cio.TerminalLogURI(uri)
                }
        }

        if s.count > 0 {
                withCount := containerd.WithAdditionalContainerLabels(map[string]string{
                        restart.CountLabel: strconv.Itoa(s.count),
                })
                if err := s.container.Update(ctx, containerd.UpdateContainerOpts(withCount)); err != nil {
                        return err
                }
        }

        // The result of killTask is not checked; the new task is created
        // regardless of whether an old one could be removed.
        _ = killTask(ctx, s.container)

        task, err := s.container.NewTask(ctx, creator)
        if err != nil {
                return err
        }
        return task.Start(ctx)
}

// killTask kills the container's task with SIGKILL, waits for it to exit and
// deletes it.
//
// It returns nil when the task cannot be loaded. If waiting on or killing the
// task fails, a delete is attempted instead: nil is returned when that delete
// succeeds, otherwise the wait/kill error is returned.
func killTask(ctx context.Context, container containerd.Container) error {
        task, err := container.Task(ctx, nil)
        if err != nil {
                return nil
        }

        // deleteOr falls back to deleting the task and reports cause only when
        // the delete fails as well.
        deleteOr := func(cause error) error {
                if _, derr := task.Delete(ctx); derr == nil {
                        return nil
                }
                return cause
        }

        exited, err := task.Wait(ctx)
        if err != nil {
                return deleteOr(err)
        }
        if err := task.Kill(ctx, syscall.SIGKILL, containerd.WithKillAll); err != nil {
                return deleteOr(err)
        }

        <-exited
        _, err = task.Delete(ctx)
        return err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package monitor

import (
        "context"
        "fmt"
        "strconv"
        "sync"
        "time"

        containerd "github.com/containerd/containerd/v2/client"
        "github.com/containerd/containerd/v2/core/runtime/restart"
        "github.com/containerd/containerd/v2/internal/tomlext"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/version"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

// Config holds the settings of the restart monitor plugin.
type Config struct {
        // Interval is how long the monitor waits between two checks for
        // container state changes.
        Interval tomlext.Duration `toml:"interval"`
}

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.ContainerMonitorPlugin,
                Requires: []plugin.Type{
                        plugins.EventPlugin,
                        plugins.ServicePlugin,
                },
                ID: "restart",
                Config: &Config{
                        Interval: tomlext.FromStdTime(10 * time.Second),
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        ic.Meta.Capabilities = []string{"no", "always", "on-failure", "unless-stopped"}
                        client, err := containerd.New("", containerd.WithInMemoryServices(ic))
                        if err != nil {
                                return nil, err
                        }
                        m := &monitor{
                                client: client,
                        }
                        go m.run(tomlext.ToStdTime(ic.Config.(*Config).Interval))
                        return m, nil
                },
                ConfigMigration: func(ctx context.Context, configVersion int, pluginConfigs map[string]interface{}) error {
                        if configVersion >= version.ConfigVersion {
                                return nil
                        }
                        const pluginName = string(plugins.InternalPlugin) + ".restart"
                        c, ok := pluginConfigs[pluginName]
                        if ok {
                                pluginConfigs[string(plugins.ContainerMonitorPlugin)+".restart"] = c
                                delete(pluginConfigs, pluginName)
                        }

                        return nil
                },
        })
}

// change is a single action the monitor applies to a container during
// reconciliation; the monitor logs any error apply returns.
type change interface {
        apply(context.Context, *containerd.Client) error
}

// monitor periodically reconciles containers carrying the restart status
// label, using client to list namespaces and containers.
type monitor struct {
        client *containerd.Client
}

// run reconciles forever, sleeping for interval after every pass. A zero
// interval is replaced by 10 seconds. Reconcile errors are logged and do not
// stop the loop.
func (m *monitor) run(interval time.Duration) {
        const fallback = 10 * time.Second
        if interval == 0 {
                interval = fallback
        }
        for ; ; time.Sleep(interval) {
                if err := m.reconcile(context.Background()); err != nil {
                        log.L.WithError(err).Error("reconcile")
                }
        }
}

// reconcile lists all namespaces and, concurrently for each of them, computes
// the pending changes and applies those concurrently as well. It returns once
// every namespace has been processed.
//
// Only a failure to list namespaces is returned; errors from an individual
// namespace or change are logged.
func (m *monitor) reconcile(ctx context.Context) error {
        names, err := m.client.NamespaceService().List(ctx)
        if err != nil {
                return err
        }

        var namespacesDone sync.WaitGroup
        namespacesDone.Add(len(names))
        for _, name := range names {
                go func(name string) {
                        defer namespacesDone.Done()

                        nsCtx := namespaces.WithNamespace(ctx, name)
                        changes, err := m.monitor(nsCtx)
                        if err != nil {
                                log.G(nsCtx).WithError(err).Error("monitor for changes")
                                return
                        }

                        var changesDone sync.WaitGroup
                        changesDone.Add(len(changes))
                        for _, c := range changes {
                                go func(c change) {
                                        defer changesDone.Done()
                                        if err := c.apply(nsCtx, m.client); err != nil {
                                                log.G(nsCtx).WithError(err).Error("apply change")
                                        }
                                }(c)
                        }
                        changesDone.Wait()
                }(name)
        }
        namespacesDone.Wait()
        return nil
}

// monitor inspects every container in the context's namespace that carries
// the restart status label and returns the changes needed to bring each one
// to its desired status.
//
// Containers whose task already has the desired status are skipped. A
// container that should be stopped yields a stopChange unless its task or
// status could not be read. A container that should be running yields a
// startChange, with the restart count incremented, unless it is paused or
// pausing or restart.Reconcile declines the restart.
func (m *monitor) monitor(ctx context.Context) ([]change, error) {
        filter := fmt.Sprintf("labels.%q", restart.StatusLabel)
        containers, err := m.client.Containers(ctx, filter)
        if err != nil {
                return nil, err
        }

        var changes []change
        for _, c := range containers {
                labels, err := c.Labels(ctx)
                if err != nil {
                        return nil, err
                }
                desiredStatus := containerd.ProcessStatus(labels[restart.StatusLabel])

                var status containerd.Status
                task, err := c.Task(ctx, nil)
                if err == nil {
                        status, err = task.Status(ctx)
                        if err == nil && desiredStatus == status.Status {
                                continue
                        }
                }

                // Task or Status return error, only desired to running
                if err != nil {
                        log.G(ctx).WithError(err).Error("monitor")
                        if desiredStatus == containerd.Stopped {
                                continue
                        }
                }

                if desiredStatus == containerd.Stopped {
                        changes = append(changes, &stopChange{container: c})
                        continue
                }
                if desiredStatus != containerd.Running {
                        continue
                }

                // Known issue:
                // The status may be empty when task failed but was deleted,
                // which will result in an `on-failure` restart policy reconcile error.
                if status.Status == containerd.Paused || status.Status == containerd.Pausing {
                        continue
                }
                if !restart.Reconcile(status, labels) {
                        continue
                }

                // A missing or malformed count label counts as zero restarts.
                restartCount, _ := strconv.Atoi(labels[restart.CountLabel])
                if labels["containerd.io/restart.logpath"] != "" {
                        log.G(ctx).Warn(`Label "containerd.io/restart.logpath" is no longer supported since containerd v2.0. Use "containerd.io/restart.loguri" instead.`)
                }
                changes = append(changes, &startChange{
                        container: c,
                        logURI:    labels[restart.LogURILabel],
                        count:     restartCount + 1,
                })
        }
        return changes, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sandbox

import (
        "context"
        "fmt"
        "os"
        "time"

        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "google.golang.org/protobuf/types/known/anypb"

        runtimeAPI "github.com/containerd/containerd/api/runtime/sandbox/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/events/exchange"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/runtime"
        v2 "github.com/containerd/containerd/v2/core/runtime/v2"
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/plugins"
)

// init registers the "shim" sandbox controller plugin. Its InitFn resolves
// the shim manager and the event exchange, makes sure the plugin's root and
// state directories exist, reloads existing shims and returns a
// controllerLocal built from those pieces.
func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.SandboxControllerPlugin,
                ID:   "shim",
                Requires: []plugin.Type{
                        plugins.ShimPlugin,
                        plugins.EventPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        shimPlugin, err := ic.GetByID(plugins.ShimPlugin, "shim")
                        if err != nil {
                                return nil, err
                        }

                        exchangePlugin, err := ic.GetByID(plugins.EventPlugin, "exchange")
                        if err != nil {
                                return nil, err
                        }

                        var (
                                shims     = shimPlugin.(*v2.ShimManager)
                                publisher = exchangePlugin.(*exchange.Exchange)
                        )
                        state := ic.Properties[plugins.PropertyStateDir]
                        root := ic.Properties[plugins.PropertyRootDir]
                        for _, d := range []string{root, state} {
                                if err := os.MkdirAll(d, 0711); err != nil {
                                        return nil, err
                                }
                        }

                        if err := shims.LoadExistingShims(ic.Context, root, state); err != nil {
                                // Wrap with %w so the underlying cause stays inspectable with
                                // errors.Is / errors.As, like the other errors in this file.
                                return nil, fmt.Errorf("failed to load existing shim sandboxes, %w", err)
                        }

                        c := &controllerLocal{
                                root:      root,
                                state:     state,
                                shims:     shims,
                                publisher: publisher,
                        }
                        return c, nil
                },
        })
}

// controllerLocal is a sandbox.Controller that drives sandboxes through
// shims managed by a v2.ShimManager.
//
// root and state are the plugin's root and state directories and are passed
// to v2.NewBundle when a sandbox is created; shims starts, looks up and
// deletes the shim backing each sandbox; publisher is the event exchange
// resolved at plugin initialization.
type controllerLocal struct {
        root      string
        state     string
        shims     *v2.ShimManager
        publisher events.Publisher
}

// Compile-time assertion that controllerLocal implements sandbox.Controller.
var _ sandbox.Controller = (*controllerLocal)(nil)

// cleanupShim tears down the shim of a sandbox on a best-effort basis: it
// asks the shim to shut the sandbox down and then deletes the shim, allowing
// 5 seconds for the delete. Failures of either step are logged, not returned.
func (c *controllerLocal) cleanupShim(ctx context.Context, sandboxID string, svc runtimeAPI.TTRPCSandboxService) {
        // Let the shim exit, then we can clean up the bundle after.
        shutdownReq := &runtimeAPI.ShutdownSandboxRequest{SandboxID: sandboxID}
        if _, err := svc.ShutdownSandbox(ctx, shutdownReq); err != nil {
                log.G(ctx).WithError(err).WithField("sandboxID", sandboxID).
                        Error("failed to shutdown sandbox")
        }

        deleteCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
        defer cancel()

        if err := c.shims.Delete(deleteCtx, sandboxID); err != nil {
                log.G(deleteCtx).WithError(err).WithField("sandboxID", sandboxID).
                        Error("failed to delete shim")
        }
}

// Create sets up a new sandbox: it creates the bundle, starts a shim for it
// and asks that shim to create the sandbox.
//
// It returns an error wrapping errdefs.ErrAlreadyExists when a shim for the
// sandbox ID is already known. The bundle is deleted again whenever Create
// returns an error after the bundle was created, and the shim is cleaned up
// when the shim's CreateSandbox call fails.
func (c *controllerLocal) Create(ctx context.Context, info sandbox.Sandbox, opts ...sandbox.CreateOpt) (retErr error) {
        sandboxID := info.ID

        var coptions sandbox.CreateOptions
        for i := range opts {
                opts[i](&coptions)
        }

        if _, err := c.shims.Get(ctx, sandboxID); err == nil {
                return fmt.Errorf("sandbox %s already running: %w", sandboxID, errdefs.ErrAlreadyExists)
        }

        bundle, err := v2.NewBundle(ctx, c.root, c.state, sandboxID, info.Spec)
        if err != nil {
                return err
        }
        // Roll the bundle back on any failure from here on.
        defer func() {
                if retErr == nil {
                        return
                }
                bundle.Delete()
        }()

        createOpts := runtime.CreateOpts{
                Spec:           info.Spec,
                RuntimeOptions: info.Runtime.Options,
                Runtime:        info.Runtime.Name,
                TaskOptions:    nil,
        }
        shim, err := c.shims.Start(ctx, sandboxID, bundle, createOpts)
        if err != nil {
                return fmt.Errorf("failed to start new shim for sandbox %s: %w", sandboxID, err)
        }

        svc, err := sandbox.NewClient(shim.Client())
        if err != nil {
                return err
        }

        // Copy the caller-supplied options into a fresh Any, if there are any.
        var options *anypb.Any
        if coptions.Options != nil {
                options = &anypb.Any{
                        TypeUrl: coptions.Options.GetTypeUrl(),
                        Value:   coptions.Options.GetValue(),
                }
        }

        req := &runtimeAPI.CreateSandboxRequest{
                SandboxID:  sandboxID,
                BundlePath: shim.Bundle(),
                Rootfs:     mount.ToProto(coptions.Rootfs),
                Options:    options,
                NetnsPath:  coptions.NetNSPath,
        }
        if _, err := svc.CreateSandbox(ctx, req); err != nil {
                c.cleanupShim(ctx, sandboxID, svc)
                return fmt.Errorf("failed to create sandbox %s: %w", sandboxID, errdefs.FromGRPC(err))
        }

        return nil
}

// Start asks the shim of a previously created sandbox to start it and returns
// the resulting controller instance: the pid and creation time reported by
// the shim plus the shim's endpoint address and version.
//
// The shim lookup error is wrapped rather than discarded, so callers can
// still inspect why the sandbox was not found. When the shim fails to start
// the sandbox, the shim is cleaned up before the error is returned.
func (c *controllerLocal) Start(ctx context.Context, sandboxID string) (sandbox.ControllerInstance, error) {
        shim, err := c.shims.Get(ctx, sandboxID)
        if err != nil {
                return sandbox.ControllerInstance{}, fmt.Errorf("unable to find sandbox %q: %w", sandboxID, err)
        }

        svc, err := sandbox.NewClient(shim.Client())
        if err != nil {
                return sandbox.ControllerInstance{}, err
        }

        resp, err := svc.StartSandbox(ctx, &runtimeAPI.StartSandboxRequest{SandboxID: sandboxID})
        if err != nil {
                c.cleanupShim(ctx, sandboxID, svc)
                return sandbox.ControllerInstance{}, fmt.Errorf("failed to start sandbox %s: %w", sandboxID, errdefs.FromGRPC(err))
        }
        address, version := shim.Endpoint()
        return sandbox.ControllerInstance{
                SandboxID: sandboxID,
                Pid:       resp.GetPid(),
                Address:   address,
                Version:   uint32(version),
                CreatedAt: resp.GetCreatedAt().AsTime(),
        }, nil
}

// Platform queries the sandbox's shim for the platform of the sandbox. The
// OS, architecture and variant are copied from the response; a response
// without platform information yields a zero platforms.Platform.
func (c *controllerLocal) Platform(ctx context.Context, sandboxID string) (platforms.Platform, error) {
        svc, err := c.getSandbox(ctx, sandboxID)
        if err != nil {
                return platforms.Platform{}, err
        }

        req := &runtimeAPI.PlatformRequest{SandboxID: sandboxID}
        response, err := svc.Platform(ctx, req)
        if err != nil {
                return platforms.Platform{}, fmt.Errorf("failed to get sandbox platform: %w", errdefs.FromGRPC(err))
        }

        p := response.GetPlatform()
        if p == nil {
                return platforms.Platform{}, nil
        }
        return platforms.Platform{
                OS:           p.OS,
                Architecture: p.Architecture,
                Variant:      p.Variant,
        }, nil
}

// Stop asks the sandbox's shim to stop the sandbox, forwarding the timeout
// from the stop options (in whole seconds) when one is set.
//
// A sandbox that cannot be found, or a shim that answers with a not-found or
// unavailable error, is not treated as a failure and Stop returns nil.
func (c *controllerLocal) Stop(ctx context.Context, sandboxID string, opts ...sandbox.StopOpt) error {
        var soptions sandbox.StopOptions
        for i := range opts {
                opts[i](&soptions)
        }

        req := &runtimeAPI.StopSandboxRequest{SandboxID: sandboxID}
        if soptions.Timeout != nil {
                req.TimeoutSecs = uint32(soptions.Timeout.Seconds())
        }

        svc, err := c.getSandbox(ctx, sandboxID)
        switch {
        case errdefs.IsNotFound(err):
                return nil
        case err != nil:
                return err
        }

        _, err = svc.StopSandbox(ctx, req)
        if err == nil {
                return nil
        }
        err = errdefs.FromGRPC(err)
        if errdefs.IsNotFound(err) || errdefs.IsUnavailable(err) {
                return nil
        }
        return fmt.Errorf("failed to stop sandbox: %w", err)
}

// Shutdown asks the sandbox's shim to shut the sandbox down and then deletes
// the shim. The first failing step aborts the operation and its error is
// returned.
func (c *controllerLocal) Shutdown(ctx context.Context, sandboxID string) error {
        svc, err := c.getSandbox(ctx, sandboxID)
        if err != nil {
                return err
        }

        req := &runtimeAPI.ShutdownSandboxRequest{SandboxID: sandboxID}
        if _, err := svc.ShutdownSandbox(ctx, req); err != nil {
                return fmt.Errorf("failed to shutdown sandbox: %w", errdefs.FromGRPC(err))
        }

        err = c.shims.Delete(ctx, sandboxID)
        if err != nil {
                return fmt.Errorf("failed to delete sandbox shim: %w", err)
        }
        return nil
}

// Wait issues a WaitSandbox request to the sandbox's shim and returns the exit
// status and exit time reported in the response.
func (c *controllerLocal) Wait(ctx context.Context, sandboxID string) (sandbox.ExitStatus, error) {
        var exit sandbox.ExitStatus

        svc, err := c.getSandbox(ctx, sandboxID)
        if err != nil {
                return exit, err
        }

        request := &runtimeAPI.WaitSandboxRequest{SandboxID: sandboxID}
        resp, err := svc.WaitSandbox(ctx, request)
        if err != nil {
                return exit, fmt.Errorf("failed to wait sandbox %s: %w", sandboxID, errdefs.FromGRPC(err))
        }

        exit.ExitStatus = resp.GetExitStatus()
        exit.ExitedAt = resp.GetExitedAt().AsTime()
        return exit, nil
}

// Status queries the sandbox's shim for its current status.
//
// When no shim is registered for sandboxID, a status carrying only the
// sandbox ID and the current time as ExitedAt is returned instead of an
// error. Otherwise the shim's SandboxStatus response is returned together
// with the shim's endpoint address and protocol version.
//
// Errors from the status RPC are converted with errdefs.FromGRPC, as the other
// controller methods do, and the shim lookup error is wrapped rather than
// discarded, so callers can inspect the cause.
func (c *controllerLocal) Status(ctx context.Context, sandboxID string, verbose bool) (sandbox.ControllerStatus, error) {
        svc, err := c.getSandbox(ctx, sandboxID)
        if errdefs.IsNotFound(err) {
                return sandbox.ControllerStatus{
                        SandboxID: sandboxID,
                        ExitedAt:  time.Now(),
                }, nil
        }
        if err != nil {
                return sandbox.ControllerStatus{}, err
        }

        resp, err := svc.SandboxStatus(ctx, &runtimeAPI.SandboxStatusRequest{
                SandboxID: sandboxID,
                Verbose:   verbose,
        })
        if err != nil {
                return sandbox.ControllerStatus{}, fmt.Errorf("failed to query sandbox %s status: %w", sandboxID, errdefs.FromGRPC(err))
        }

        shim, err := c.shims.Get(ctx, sandboxID)
        if err != nil {
                return sandbox.ControllerStatus{}, fmt.Errorf("unable to find sandbox %q: %w", sandboxID, err)
        }
        address, version := shim.Endpoint()

        return sandbox.ControllerStatus{
                SandboxID: resp.GetSandboxID(),
                Pid:       resp.GetPid(),
                State:     resp.GetState(),
                Info:      resp.GetInfo(),
                CreatedAt: resp.GetCreatedAt().AsTime(),
                ExitedAt:  resp.GetExitedAt().AsTime(),
                Extra:     resp.GetExtra(),
                Address:   address,
                Version:   uint32(version),
        }, nil
}

// Metrics requests the sandbox's metrics from its shim and returns the Metrics
// field of the response.
//
// The RPC error is converted with errdefs.FromGRPC so that callers see the
// same error kinds as from the other controller methods.
func (c *controllerLocal) Metrics(ctx context.Context, sandboxID string) (*types.Metric, error) {
        sb, err := c.getSandbox(ctx, sandboxID)
        if err != nil {
                return nil, err
        }
        req := &runtimeAPI.SandboxMetricsRequest{SandboxID: sandboxID}
        resp, err := sb.SandboxMetrics(ctx, req)
        if err != nil {
                return nil, errdefs.FromGRPC(err)
        }
        return resp.Metrics, nil
}

// getSandbox looks up the shim registered under id and builds a sandbox
// service client on top of the shim's client connection. The lookup error is
// returned unchanged.
func (c *controllerLocal) getSandbox(ctx context.Context, id string) (runtimeAPI.TTRPCSandboxService, error) {
        shim, err := c.shims.Get(ctx, id)
        if err != nil {
                return nil, err
        }

        conn := shim.Client()
        return sandbox.NewClient(conn)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package containers

import (
        api "github.com/containerd/containerd/api/services/containers/v1"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/typeurl/v2"
)

// containersToProto converts each container in the slice to its API
// representation, preserving order. An empty input yields a nil slice.
func containersToProto(containers []containers.Container) []*api.Container {
        var converted []*api.Container

        for i := range containers {
                converted = append(converted, containerToProto(&containers[i]))
        }

        return converted
}

// containerToProto builds the API message for a stored container. Extensions,
// runtime options and spec are converted with protobuf.FromAny, and the
// timestamps with protobuf.ToTimestamp.
func containerToProto(container *containers.Container) *api.Container {
        extpb := make(map[string]*types.Any)
        for name, ext := range container.Extensions {
                extpb[name] = protobuf.FromAny(ext)
        }

        runtimepb := &api.Container_Runtime{
                Name:    container.Runtime.Name,
                Options: protobuf.FromAny(container.Runtime.Options),
        }

        return &api.Container{
                ID:          container.ID,
                Labels:      container.Labels,
                Image:       container.Image,
                Runtime:     runtimepb,
                Spec:        protobuf.FromAny(container.Spec),
                Snapshotter: container.Snapshotter,
                SnapshotKey: container.SnapshotKey,
                CreatedAt:   protobuf.ToTimestamp(container.CreatedAt),
                UpdatedAt:   protobuf.ToTimestamp(container.UpdatedAt),
                Extensions:  extpb,
                Sandbox:     container.SandboxID,
        }
}

// containerFromProto converts an API container message into the store's
// container type. The runtime info is left at its zero value when the message
// has no Runtime. CreatedAt and UpdatedAt are not copied from the message.
func containerFromProto(containerpb *api.Container) containers.Container {
        var rt containers.RuntimeInfo
        if src := containerpb.Runtime; src != nil {
                rt.Name = src.Name
                rt.Options = src.Options
        }

        exts := make(map[string]typeurl.Any)
        for name, ext := range containerpb.Extensions {
                exts[name] = ext
        }

        return containers.Container{
                ID:          containerpb.ID,
                Labels:      containerpb.Labels,
                Image:       containerpb.Image,
                Runtime:     rt,
                Spec:        containerpb.Spec,
                Snapshotter: containerpb.Snapshotter,
                SnapshotKey: containerpb.SnapshotKey,
                Extensions:  exts,
                SandboxID:   containerpb.Sandbox,
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package containers

import (
        "context"
        "io"

        eventstypes "github.com/containerd/containerd/api/events"
        api "github.com/containerd/containerd/api/services/containers/v1"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/metadata"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/errdefs"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        bolt "go.etcd.io/bbolt"
        "google.golang.org/grpc"
        "google.golang.org/grpc/codes"
        grpcm "google.golang.org/grpc/metadata"
        "google.golang.org/grpc/status"
)

// init registers the containers service plugin. Its InitFn resolves the
// metadata and event plugins and returns a *local backed by the metadata
// database's container store.
func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.ServicePlugin,
                ID:   services.ContainersService,
                Requires: []plugin.Type{
                        plugins.EventPlugin,
                        plugins.MetadataPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        metaInstance, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }
                        eventInstance, err := ic.GetSingle(plugins.EventPlugin)
                        if err != nil {
                                return nil, err
                        }

                        mdb := metaInstance.(*metadata.DB)
                        store := metadata.NewContainerStore(mdb)
                        return &local{
                                Store:     store,
                                db:        mdb,
                                publisher: eventInstance.(events.Publisher),
                        }, nil
                },
        })
}

// local is the in-process implementation of the containers API client. It
// wraps a container store, runs store calls inside transactions on db, and
// publishes an event after each successful create, update and delete.
type local struct {
        containers.Store
        // db provides the View/Update transactions the store calls run in.
        db *metadata.DB
        // publisher receives the /containers/* events.
        publisher events.Publisher
}

// Compile-time check that *local satisfies the generated client interface.
var _ api.ContainersClient = &local{}

// Get loads the container with req.ID inside a read-only transaction. The
// response pointer is always non-nil; its Container is set only on success.
// Store errors are converted to gRPC errors.
func (l *local) Get(ctx context.Context, req *api.GetContainerRequest, _ ...grpc.CallOption) (*api.GetContainerResponse, error) {
        resp := &api.GetContainerResponse{}

        err := l.withStoreView(ctx, func(ctx context.Context) error {
                found, err := l.Store.Get(ctx, req.ID)
                if err != nil {
                        return err
                }
                resp.Container = containerToProto(&found)
                return nil
        })

        return resp, errdefs.ToGRPC(err)
}

// List returns the containers matching req.Filters, read inside a read-only
// transaction. The response pointer is always non-nil; store errors are
// converted to gRPC errors.
func (l *local) List(ctx context.Context, req *api.ListContainersRequest, _ ...grpc.CallOption) (*api.ListContainersResponse, error) {
        resp := &api.ListContainersResponse{}

        err := l.withStoreView(ctx, func(ctx context.Context) error {
                matched, err := l.Store.List(ctx, req.Filters...)
                if err != nil {
                        return err
                }
                resp.Containers = containersToProto(matched)
                return nil
        })

        return resp, errdefs.ToGRPC(err)
}

// ListStream loads all containers matching req.Filters up front, inside a
// read-only transaction, and returns an in-memory stream that yields them one
// by one. The stream is returned even when the listing fails.
func (l *local) ListStream(ctx context.Context, req *api.ListContainersRequest, _ ...grpc.CallOption) (api.Containers_ListStreamClient, error) {
        stream := &localStream{ctx: ctx}

        err := l.withStoreView(ctx, func(ctx context.Context) error {
                matched, err := l.Store.List(ctx, req.Filters...)
                if err != nil {
                        return err
                }
                stream.containers = containersToProto(matched)
                return nil
        })

        return stream, errdefs.ToGRPC(err)
}

// Create stores the container described by req.Container inside a write
// transaction and, on success, publishes a /containers/create event.
//
// A request without a Container is rejected with InvalidArgument; previously
// it caused a nil pointer dereference while converting the message. Store
// errors are converted to gRPC errors; a publish error is returned as is,
// alongside the already populated response.
func (l *local) Create(ctx context.Context, req *api.CreateContainerRequest, _ ...grpc.CallOption) (*api.CreateContainerResponse, error) {
        if req.Container == nil {
                return nil, status.Error(codes.InvalidArgument, "Container required")
        }

        var resp api.CreateContainerResponse

        if err := l.withStoreUpdate(ctx, func(ctx context.Context) error {
                container := containerFromProto(req.Container)

                created, err := l.Store.Create(ctx, container)
                if err != nil {
                        return err
                }

                resp.Container = containerToProto(&created)

                return nil
        }); err != nil {
                return &resp, errdefs.ToGRPC(err)
        }
        // containerToProto always sets Runtime, so it is safe to dereference here.
        if err := l.publisher.Publish(ctx, "/containers/create", &eventstypes.ContainerCreate{
                ID:    resp.Container.ID,
                Image: resp.Container.Image,
                Runtime: &eventstypes.ContainerCreate_Runtime{
                        Name:    resp.Container.Runtime.Name,
                        Options: resp.Container.Runtime.Options,
                },
        }); err != nil {
                return &resp, err
        }

        return &resp, nil
}

// Update applies req.Container to the stored container inside a write
// transaction, restricted to the field paths in req.UpdateMask when any are
// given, and publishes a /containers/update event on success.
//
// A request with no Container or an empty Container.ID is rejected with
// InvalidArgument; a missing Container previously caused a nil pointer
// dereference. Store errors are converted to gRPC errors; a publish error is
// returned as is, alongside the already populated response.
func (l *local) Update(ctx context.Context, req *api.UpdateContainerRequest, _ ...grpc.CallOption) (*api.UpdateContainerResponse, error) {
        if req.Container == nil || req.Container.ID == "" {
                return nil, status.Error(codes.InvalidArgument, "Container.ID required")
        }
        var (
                resp      api.UpdateContainerResponse
                container = containerFromProto(req.Container)
        )

        if err := l.withStoreUpdate(ctx, func(ctx context.Context) error {
                // Copy the mask paths so the store never aliases the request's slice.
                var fieldpaths []string
                if req.UpdateMask != nil && len(req.UpdateMask.Paths) > 0 {
                        fieldpaths = append(fieldpaths, req.UpdateMask.Paths...)
                }

                updated, err := l.Store.Update(ctx, container, fieldpaths...)
                if err != nil {
                        return err
                }

                resp.Container = containerToProto(&updated)
                return nil
        }); err != nil {
                return &resp, errdefs.ToGRPC(err)
        }

        if err := l.publisher.Publish(ctx, "/containers/update", &eventstypes.ContainerUpdate{
                ID:          resp.Container.ID,
                Image:       resp.Container.Image,
                Labels:      resp.Container.Labels,
                SnapshotKey: resp.Container.SnapshotKey,
        }); err != nil {
                return &resp, err
        }

        return &resp, nil
}

// Delete removes the container with req.ID inside a write transaction and
// then publishes a /containers/delete event. An Empty message is returned in
// every case; store errors are converted to gRPC errors.
func (l *local) Delete(ctx context.Context, req *api.DeleteContainerRequest, _ ...grpc.CallOption) (*ptypes.Empty, error) {
        remove := func(ctx context.Context) error {
                return l.Store.Delete(ctx, req.ID)
        }
        if err := l.withStoreUpdate(ctx, remove); err != nil {
                return &ptypes.Empty{}, errdefs.ToGRPC(err)
        }

        event := &eventstypes.ContainerDelete{ID: req.ID}
        if err := l.publisher.Publish(ctx, "/containers/delete", event); err != nil {
                return &ptypes.Empty{}, err
        }

        return &ptypes.Empty{}, nil
}

// withStore adapts fn to a bolt transaction callback: when invoked with a
// transaction, it calls fn with ctx extended by that transaction.
func (l *local) withStore(ctx context.Context, fn func(ctx context.Context) error) func(tx *bolt.Tx) error {
        inTx := func(tx *bolt.Tx) error {
                txCtx := metadata.WithTransactionContext(ctx, tx)
                return fn(txCtx)
        }
        return inTx
}

// withStoreView runs fn inside a View transaction on the metadata database.
func (l *local) withStoreView(ctx context.Context, fn func(ctx context.Context) error) error {
        txFn := l.withStore(ctx, fn)
        return l.db.View(txFn)
}

// withStoreUpdate runs fn inside an Update transaction on the metadata
// database.
func (l *local) withStoreUpdate(ctx context.Context, fn func(ctx context.Context) error) error {
        txFn := l.withStore(ctx, fn)
        return l.db.Update(txFn)
}

// localStream is an in-memory list stream: it holds a fully loaded slice of
// containers and hands them out one per Recv call.
type localStream struct {
        // ctx is returned by Context.
        ctx context.Context
        // containers is the full result set to be streamed.
        containers []*api.Container
        // i is the index of the next container Recv will return.
        i int
}

// Recv returns the next buffered container wrapped in a list message and
// advances the cursor; once every container has been returned it yields
// io.EOF.
func (s *localStream) Recv() (*api.ListContainerMessage, error) {
        next := s.i
        if next >= len(s.containers) {
                return nil, io.EOF
        }

        s.i = next + 1
        return &api.ListContainerMessage{Container: s.containers[next]}, nil
}

// Context returns the context stored in the stream.
func (s *localStream) Context() context.Context {
        return s.ctx
}

// CloseSend does nothing and always returns nil.
func (s *localStream) CloseSend() error {
        return nil
}

// Header always returns nil metadata and a nil error.
func (s *localStream) Header() (grpcm.MD, error) {
        return nil, nil
}

// Trailer always returns nil metadata.
func (s *localStream) Trailer() grpcm.MD {
        return nil
}

// SendMsg ignores m and always returns nil.
func (s *localStream) SendMsg(m interface{}) error {
        return nil
}

// RecvMsg always returns nil and does not populate m; use Recv to read the
// buffered containers.
func (s *localStream) RecvMsg(m interface{}) error {
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package containers

import (
        "context"
        "io"

        api "github.com/containerd/containerd/api/services/containers/v1"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "google.golang.org/grpc"
)

// init registers the "containers" gRPC plugin. Its InitFn looks up the
// containers service plugin by ID and wraps it in a gRPC service.
func init() {
        registry.Register(&plugin.Registration{
                Type:     plugins.GRPCPlugin,
                ID:       "containers",
                Requires: []plugin.Type{plugins.ServicePlugin},
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        instance, err := ic.GetByID(plugins.ServicePlugin, services.ContainersService)
                        if err != nil {
                                return nil, err
                        }

                        client := instance.(api.ContainersClient)
                        return &service{local: client}, nil
                },
        })
}

// service is the gRPC containers server. Every RPC is forwarded to local.
type service struct {
        // local is the client implementation the RPCs delegate to.
        local api.ContainersClient
        api.UnimplementedContainersServer
}

// Compile-time check that *service satisfies the generated server interface.
var _ api.ContainersServer = &service{}

// Register attaches the containers service to the given gRPC server. It
// always returns nil.
func (s *service) Register(server *grpc.Server) error {
        api.RegisterContainersServer(server, s)
        return nil
}

// Get forwards the request to the local containers client.
func (s *service) Get(ctx context.Context, req *api.GetContainerRequest) (*api.GetContainerResponse, error) {
        return s.local.Get(ctx, req)
}

// List forwards the request to the local containers client.
func (s *service) List(ctx context.Context, req *api.ListContainersRequest) (*api.ListContainersResponse, error) {
        return s.local.List(ctx, req)
}

// ListStream opens a list stream on the local containers client and relays
// each message to the gRPC stream. It returns nil when the source is
// exhausted or when the stream's context is done, and the first error from
// the source or from Send otherwise.
func (s *service) ListStream(req *api.ListContainersRequest, stream api.Containers_ListStreamServer) error {
        source, err := s.local.ListStream(stream.Context(), req)
        if err != nil {
                return err
        }

        for {
                // Stop quietly once the caller has gone away.
                select {
                case <-stream.Context().Done():
                        return nil
                default:
                }

                msg, err := source.Recv()
                if err == io.EOF {
                        return nil
                }
                if err != nil {
                        return err
                }

                if err := stream.Send(msg); err != nil {
                        return err
                }
        }
}

// Create forwards the request to the local containers client.
func (s *service) Create(ctx context.Context, req *api.CreateContainerRequest) (*api.CreateContainerResponse, error) {
        return s.local.Create(ctx, req)
}

// Update forwards the request to the local containers client.
func (s *service) Update(ctx context.Context, req *api.UpdateContainerRequest) (*api.UpdateContainerResponse, error) {
        return s.local.Update(ctx, req)
}

// Delete forwards the request to the local containers client.
func (s *service) Delete(ctx context.Context, req *api.DeleteContainerRequest) (*ptypes.Empty, error) {
        return s.local.Delete(ctx, req)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package contentserver

import (
        "context"
        "fmt"
        "io"
        "sync"

        api "github.com/containerd/containerd/api/services/content/v1"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        digest "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "google.golang.org/grpc"
        "google.golang.org/grpc/codes"
        "google.golang.org/grpc/status"
)

// service is the gRPC content server; it serves requests from store.
type service struct {
        // store is the content store every RPC operates on.
        store content.Store
        api.UnimplementedContentServer
}

var (
        // empty is the shared Empty message returned by Delete and Abort.
        empty = &ptypes.Empty{}
        // bufPool recycles the copy buffers used by Read. Each new buffer is
        // 1 MiB (1<<20 bytes) and is stored as a pointer to the slice.
        bufPool = sync.Pool{
                New: func() interface{} {
                        buffer := make([]byte, 1<<20)
                        return &buffer
                },
        }
)

// New returns a content gRPC server that serves requests from the content
// store cs.
func New(cs content.Store) api.ContentServer {
        return &service{store: cs}
}

// Register attaches the content service to the given gRPC server. It always
// returns nil.
func (s *service) Register(server *grpc.Server) error {
        api.RegisterContentServer(server, s)
        return nil
}

// Info returns the stored info for the blob named by req.Digest. A digest
// that fails to parse yields InvalidArgument; store errors are converted to
// gRPC errors.
func (s *service) Info(ctx context.Context, req *api.InfoRequest) (*api.InfoResponse, error) {
        dgst, err := digest.Parse(req.Digest)
        if err != nil {
                return nil, status.Errorf(codes.InvalidArgument, "%q failed validation", req.Digest)
        }

        blobInfo, err := s.store.Info(ctx, dgst)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &api.InfoResponse{Info: infoToGRPC(blobInfo)}
        return resp, nil
}

// Update applies req.Info to the stored blob info, restricted to the field
// paths in req.UpdateMask, and returns the updated info.
//
// A request without Info is rejected with InvalidArgument; previously it
// caused a nil pointer dereference. A digest that fails to parse also yields
// InvalidArgument; store errors are converted to gRPC errors.
func (s *service) Update(ctx context.Context, req *api.UpdateRequest) (*api.UpdateResponse, error) {
        if req.Info == nil {
                return nil, status.Error(codes.InvalidArgument, "info required")
        }

        // Only the digest's validity is checked here; the parsed value is unused.
        _, err := digest.Parse(req.Info.Digest)
        if err != nil {
                return nil, status.Errorf(codes.InvalidArgument, "%q failed validation", req.Info.Digest)
        }

        info, err := s.store.Update(ctx, infoFromGRPC(req.Info), req.UpdateMask.GetPaths()...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        return &api.UpdateResponse{
                Info: infoToGRPC(info),
        }, nil
}

// List walks the content store with req.Filters and streams the matching
// blob infos to the client in blocks of up to 100 entries, followed by a
// final partial block if any entries remain. Walk errors are converted to
// gRPC errors.
func (s *service) List(req *api.ListContentRequest, session api.Content_ListServer) error {
        var pending []*api.Info

        // flush sends everything accumulated so far as one response.
        flush := func() error {
                return session.Send(&api.ListContentResponse{Info: pending})
        }

        walkErr := s.store.Walk(session.Context(), func(info content.Info) error {
                pending = append(pending, infoToGRPC(info))
                if len(pending) < 100 {
                        return nil
                }

                if err := flush(); err != nil {
                        return err
                }
                // Keep the backing array for the next block.
                pending = pending[:0]
                return nil
        }, req.Filters...)
        if walkErr != nil {
                return errdefs.ToGRPC(walkErr)
        }

        if len(pending) == 0 {
                return nil
        }
        // Send the final, partially filled block.
        return flush()
}

// Delete removes the blob named by req.Digest from the content store. A
// digest that fails to parse yields InvalidArgument carrying the parse error
// text; store errors are converted to gRPC errors.
func (s *service) Delete(ctx context.Context, req *api.DeleteContentRequest) (*ptypes.Empty, error) {
        log.G(ctx).WithField("digest", req.Digest).Debugf("delete content")
        dg, err := digest.Parse(req.Digest)
        if err != nil {
                // Use status.Error, not Errorf: the message is not a format string,
                // so any '%' in it must be passed through verbatim.
                return nil, status.Error(codes.InvalidArgument, err.Error())
        }

        if err := s.store.Delete(ctx, dg); err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        return empty, nil
}

// Read streams the bytes of the blob named by req.Digest to the client.
//
// A negative req.Offset is treated as 0, and an offset beyond the blob size
// yields OutOfRange. A non-positive req.Size, or one that reaches past the end
// of the blob, is clamped to the bytes remaining after the offset. A digest
// that fails to parse yields InvalidArgument; store and copy errors are
// converted to gRPC errors.
func (s *service) Read(req *api.ReadContentRequest, session api.Content_ReadServer) error {
        dg, err := digest.Parse(req.Digest)
        if err != nil {
                return status.Errorf(codes.InvalidArgument, "%v: %v", req.Digest, err)
        }

        oi, err := s.store.Info(session.Context(), dg)
        if err != nil {
                return errdefs.ToGRPC(err)
        }

        ra, err := s.store.ReaderAt(session.Context(), ocispec.Descriptor{Digest: dg})
        if err != nil {
                return errdefs.ToGRPC(err)
        }
        defer ra.Close()

        var (
                offset = req.Offset
                // size is read size, not the expected size of the blob (oi.Size), which the caller might not be aware of.
                // offset+size can be larger than oi.Size.
                size = req.Size

                // Scratch buffer for io.CopyBuffer, taken from the global pool
                // (bufPool allocates 1 MiB buffers).
                p = bufPool.Get().(*[]byte)
        )
        defer bufPool.Put(p)

        if offset < 0 {
                offset = 0
        }

        if offset > oi.Size {
                return status.Errorf(codes.OutOfRange, "read past object length %v bytes", oi.Size)
        }

        // offset now lies in [0, oi.Size]. Compare size against the remaining
        // byte count rather than computing offset+size, which can overflow
        // int64 when the caller asks for a very large size.
        if remaining := oi.Size - offset; size <= 0 || size > remaining {
                size = remaining
        }

        _, err = io.CopyBuffer(
                &readResponseWriter{session: session},
                io.NewSectionReader(ra, offset, size), *p)
        return errdefs.ToGRPC(err)
}

// readResponseWriter is a writer that places the output into
// ReadContentResponse messages sent on the session.
//
// This allows io.CopyBuffer to do the heavy lifting of chunking the responses
// into the buffer size.
type readResponseWriter struct {
        // offset is the total number of bytes sent so far; it is stamped on each
        // outgoing message.
        offset int64
        // session is the read stream the messages are sent on.
        session api.Content_ReadServer
}

// Write sends p as a single ReadContentResponse stamped with the writer's
// current offset, then advances the offset by len(p). On a send error it
// reports zero bytes written and leaves the offset unchanged.
func (rw *readResponseWriter) Write(p []byte) (n int, err error) {
        chunk := &api.ReadContentResponse{
                Offset: rw.offset,
                Data:   p,
        }
        if sendErr := rw.session.Send(chunk); sendErr != nil {
                return 0, sendErr
        }

        sent := len(p)
        rw.offset += int64(sent)
        return sent, nil
}

// Status returns the state of the in-progress write identified by req.Ref.
// Store errors are converted to gRPC errors annotated with the ref.
func (s *service) Status(ctx context.Context, req *api.StatusRequest) (*api.StatusResponse, error) {
        st, err := s.store.Status(ctx, req.Ref)
        if err != nil {
                return nil, errdefs.ToGRPCf(err, "could not get status for ref %q", req.Ref)
        }

        resp := &api.StatusResponse{
                Status: &api.Status{
                        StartedAt: protobuf.ToTimestamp(st.StartedAt),
                        UpdatedAt: protobuf.ToTimestamp(st.UpdatedAt),
                        Ref:       st.Ref,
                        Offset:    st.Offset,
                        Total:     st.Total,
                        Expected:  st.Expected.String(),
                },
        }
        return resp, nil
}

// ListStatuses returns the state of every in-progress write matching
// req.Filters, in the order the store reports them. Store errors are
// converted to gRPC errors.
func (s *service) ListStatuses(ctx context.Context, req *api.ListStatusesRequest) (*api.ListStatusesResponse, error) {
        active, err := s.store.ListStatuses(ctx, req.Filters...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &api.ListStatusesResponse{}
        for _, st := range active {
                entry := &api.Status{
                        StartedAt: protobuf.ToTimestamp(st.StartedAt),
                        UpdatedAt: protobuf.ToTimestamp(st.UpdatedAt),
                        Ref:       st.Ref,
                        Offset:    st.Offset,
                        Total:     st.Total,
                        Expected:  st.Expected.String(),
                }
                resp.Statuses = append(resp.Statuses, entry)
        }

        return resp, nil
}

// Write serves one content write session.
//
// The first request must carry a ref; it may also declare the expected total
// size and digest. A writer for that ref is opened on the store and kept until
// the session ends. Each request is then handled according to its action
// (STAT, WRITE or COMMIT) and answered with a WriteContentResponse carrying
// the writer's offset and digest. The loop ends after a COMMIT, when the
// client closes the stream (io.EOF), or on the first error.
//
// If a request names an expected digest that already exists in the store, the
// writer is closed, the ingest is aborted and AlreadyExists is returned.
//
// The named result err is read by the deferred function: on error it logs
// (except for AlreadyExists), and on success it sends msg once more.
func (s *service) Write(session api.Content_WriteServer) (err error) {
        var (
                ctx      = session.Context()
                msg      api.WriteContentResponse
                req      *api.WriteContentRequest
                ref      string
                total    int64
                expected digest.Digest
        )

        defer func(msg *api.WriteContentResponse) {
                // pump through the last message if no error was encountered
                if err != nil {
                        if s, ok := status.FromError(err); ok && s.Code() != codes.AlreadyExists {
                                // TODO(stevvooe): Really need a log line here to track which
                                // errors are actually causing failure on the server side. May want
                                // to configure the service with an interceptor to make this work
                                // identically across all GRPC methods.
                                //
                                // This is pretty noisy, so we can remove it but leave it for now.
                                log.G(ctx).WithError(err).Error("(*service).Write failed")
                        }

                        return
                }

                err = session.Send(msg)
        }(&msg)

        // handle the very first request!
        req, err = session.Recv()
        if err != nil {
                return err
        }

        ref = req.Ref

        if ref == "" {
                return status.Errorf(codes.InvalidArgument, "first message must have a reference")
        }

        // Attach the ref (and total/expected, when declared) to every log line
        // emitted for this session.
        fields := log.Fields{
                "ref": ref,
        }
        total = req.Total
        expected = digest.Digest(req.Expected)
        if total > 0 {
                fields["total"] = total
        }

        if expected != "" {
                fields["expected"] = expected
        }

        ctx = log.WithLogger(ctx, log.G(ctx).WithFields(fields))

        log.G(ctx).Debug("(*service).Write started")
        // this action locks the writer for the session.
        wr, err := s.store.Writer(ctx,
                content.WithRef(ref),
                content.WithDescriptor(ocispec.Descriptor{Size: total, Digest: expected}))
        if err != nil {
                return errdefs.ToGRPC(err)
        }
        defer wr.Close()

        // One iteration per request; req holds the request being processed.
        for {
                msg.Action = req.Action
                ws, err := wr.Status()
                if err != nil {
                        return errdefs.ToGRPC(err)
                }

                msg.Offset = ws.Offset // always set the offset.

                // NOTE(stevvooe): In general, there are two cases under which a remote
                // writer is used.
                //
                // For pull, we almost always have this before fetching large content,
                // through descriptors. We allow predeclaration of the expected size
                // and digest.
                //
                // For push, it is more complex. If we want to cut through content into
                // storage, we may have no expectation until we are done processing the
                // content. The case here is the following:
                //
                //         1. Start writing content.
                //         2. Compress inline.
                //         3. Validate digest and size (maybe).
                //
                // Supporting these two paths is quite awkward but it lets both API
                // users use the same writer style for each with a minimum of overhead.
                if req.Expected != "" {
                        dg := digest.Digest(req.Expected)
                        if expected != "" && expected != dg {
                                log.G(ctx).Debugf("commit digest differs from writer digest: %v != %v", dg, expected)
                        }
                        expected = dg

                        // The blob is already in the store: drop this ingest and tell
                        // the client so.
                        if _, err := s.store.Info(session.Context(), dg); err == nil {
                                if err := wr.Close(); err != nil {
                                        log.G(ctx).WithError(err).Error("failed to close writer")
                                }
                                if err := s.store.Abort(session.Context(), ref); err != nil {
                                        log.G(ctx).WithError(err).Error("failed to abort write")
                                }

                                return status.Errorf(codes.AlreadyExists, "blob with expected digest %v exists", req.Expected)
                        }
                }

                if req.Total > 0 {
                        // Update the expected total. Typically, this could be seen at
                        // negotiation time or on a commit message.
                        if total > 0 && req.Total != total {
                                log.G(ctx).Debugf("commit size differs from writer size: %v != %v", req.Total, total)
                        }
                        total = req.Total
                }

                switch req.Action {
                case api.WriteAction_STAT:
                        msg.Digest = wr.Digest().String()
                        msg.StartedAt = protobuf.ToTimestamp(ws.StartedAt)
                        msg.UpdatedAt = protobuf.ToTimestamp(ws.UpdatedAt)
                        msg.Total = total
                case api.WriteAction_WRITE, api.WriteAction_COMMIT:
                        if req.Offset > 0 {
                                // validate the offset if provided
                                if req.Offset != ws.Offset {
                                        return status.Errorf(codes.OutOfRange, "write @%v must occur at current offset %v", req.Offset, ws.Offset)
                                }
                        }

                        // A zero request offset while the writer already holds data
                        // truncates the writer back to the start.
                        if req.Offset == 0 && ws.Offset > 0 {
                                if err := wr.Truncate(req.Offset); err != nil {
                                        return fmt.Errorf("truncate failed: %w", err)
                                }
                                msg.Offset = req.Offset
                        }

                        // issue the write if we actually have data.
                        if len(req.Data) > 0 {
                                // While this looks like we could use io.WriterAt here, because we
                                // maintain the offset as append only, we just issue the write.
                                n, err := wr.Write(req.Data)
                                if err != nil {
                                        return errdefs.ToGRPC(err)
                                }

                                if n != len(req.Data) {
                                        // TODO(stevvooe): Perhaps, we can recover this by including it
                                        // in the offset on the write return.
                                        return status.Errorf(codes.DataLoss, "wrote %v of %v bytes", n, len(req.Data))
                                }

                                msg.Offset += int64(n)
                        }

                        if req.Action == api.WriteAction_COMMIT {
                                var opts []content.Opt
                                if req.Labels != nil {
                                        opts = append(opts, content.WithLabels(req.Labels))
                                }
                                if err := wr.Commit(ctx, total, expected, opts...); err != nil {
                                        return errdefs.ToGRPC(err)
                                }
                        }

                        msg.Digest = wr.Digest().String()
                }

                if err := session.Send(&msg); err != nil {
                        return err
                }

                // A commit ends the session.
                if req.Action == api.WriteAction_COMMIT {
                        return nil
                }

                req, err = session.Recv()
                if err != nil {
                        // The client closing the stream is a normal end of session.
                        if err == io.EOF {
                                return nil
                        }

                        return err
                }
        }
}

// Abort forwards the abort of the reference named by req.Ref to the backing
// store. A store failure is translated with errdefs.ToGRPC; on success the
// shared empty response is returned.
func (s *service) Abort(ctx context.Context, req *api.AbortRequest) (*ptypes.Empty, error) {
        err := s.store.Abort(ctx, req.Ref)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return empty, nil
}

// infoToGRPC builds the API representation of a content.Info: the digest is
// rendered as a string, both timestamps go through protobuf.ToTimestamp, and
// size and labels are carried over unchanged.
func infoToGRPC(info content.Info) *api.Info {
        out := new(api.Info)
        out.Digest = info.Digest.String()
        out.Size = info.Size
        out.CreatedAt = protobuf.ToTimestamp(info.CreatedAt)
        out.UpdatedAt = protobuf.ToTimestamp(info.UpdatedAt)
        out.Labels = info.Labels
        return out
}

// infoFromGRPC is the inverse of infoToGRPC: it turns an API info message
// back into a content.Info. The digest string is converted without
// validation, and the labels map is shared with the message, not copied.
func infoFromGRPC(info *api.Info) content.Info {
        var out content.Info
        out.Digest = digest.Digest(info.Digest)
        out.Size = info.Size
        out.CreatedAt = protobuf.FromTimestamp(info.CreatedAt)
        out.UpdatedAt = protobuf.FromTimestamp(info.UpdatedAt)
        out.Labels = info.Labels
        return out
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package content

import (
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/containerd/v2/plugins/services/content/contentserver"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "content",
                Requires: []plugin.Type{
                        plugins.ServicePlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        cs, err := ic.GetByID(plugins.ServicePlugin, services.ContentService)
                        if err != nil {
                                return nil, err
                        }
                        return contentserver.New(cs.(content.Store)), nil
                },
        })
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package content

import (
        "context"

        eventstypes "github.com/containerd/containerd/api/events"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        digest "github.com/opencontainers/go-digest"
)

// store wraps a content.Store and publishes an event when content is
// deleted. All other Store methods are inherited unchanged through the
// embedded interface.
type store struct {
        content.Store
        // publisher receives the events emitted by the wrapper.
        publisher events.Publisher
}

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.ServicePlugin,
                ID:   services.ContentService,
                Requires: []plugin.Type{
                        plugins.EventPlugin,
                        plugins.MetadataPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        m, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }
                        ep, err := ic.GetSingle(plugins.EventPlugin)
                        if err != nil {
                                return nil, err
                        }

                        s, err := newContentStore(m.(*metadata.DB).ContentStore(), ep.(events.Publisher))
                        return s, err
                },
        })
}

// newContentStore returns cs wrapped so that deletions are announced through
// publisher. The returned error is always nil.
func newContentStore(cs content.Store, publisher events.Publisher) (content.Store, error) {
        wrapped := &store{Store: cs, publisher: publisher}
        return wrapped, nil
}

// Delete removes dgst from the wrapped store and, if that succeeds, publishes
// a ContentDelete event on the "/content/delete" topic. A failure to publish
// is returned to the caller even though the content is already gone.
func (s *store) Delete(ctx context.Context, dgst digest.Digest) error {
        err := s.Store.Delete(ctx, dgst)
        if err != nil {
                return err
        }

        // TODO: Consider whether we should return error here.
        deleted := &eventstypes.ContentDelete{Digest: dgst.String()}
        return s.publisher.Publish(ctx, "/content/delete", deleted)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package diff

import (
        "context"
        "fmt"

        diffapi "github.com/containerd/containerd/api/services/diff/v1"
        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/pkg/oci"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/errdefs"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "github.com/containerd/typeurl/v2"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "google.golang.org/grpc"
)

// config is the configuration of the diff service plugin.
type config struct {
        // Order is the order of preference in which to try diff algorithms; the
        // first differ that is supported is used.
        // Note that when multiple differs are supported, this order decides
        // which one is chosen. Each differ should return the same correct
        // output, so any ordering may be used to prefer more optimal
        // implementations.
        Order []string `toml:"default"`
}

// differ is the combination of interfaces a diff plugin must satisfy to be
// usable by this service: it must both compare and apply.
type differ interface {
        diff.Comparer
        diff.Applier
}

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.ServicePlugin,
                ID:   services.DiffService,
                Requires: []plugin.Type{
                        plugins.DiffPlugin,
                },
                Config: defaultDifferConfig,
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        differs, err := ic.GetByType(plugins.DiffPlugin)
                        if err != nil {
                                return nil, err
                        }

                        orderedNames := ic.Config.(*config).Order
                        ordered := make([]differ, len(orderedNames))
                        for i, n := range orderedNames {
                                d, ok := differs[n]
                                if !ok {
                                        return nil, fmt.Errorf("needed differ not loaded: %s", n)
                                }

                                ordered[i], ok = d.(differ)
                                if !ok {
                                        return nil, fmt.Errorf("differ does not implement Comparer and Applier interface: %s", n)
                                }
                        }

                        return &local{
                                differs: ordered,
                        }, nil
                },
        })
}

// local implements the diff client interface in-process on top of the
// configured differs.
type local struct {
        // differs holds the differs in the order in which they are tried.
        differs []differ
}

// Compile-time check that local satisfies diffapi.DiffClient.
var _ diffapi.DiffClient = &local{}

// Apply applies the diff described by er.Diff onto er.Mounts using the first
// configured differ that implements the operation.
//
// Differs are tried in order; one that reports "not implemented" is skipped
// and the next is tried. Any other outcome, success or failure, ends the
// search. If no differ is configured at all, a NotImplemented error is
// returned instead of reporting an empty descriptor as a successful apply.
// All errors are translated with errdefs.ToGRPC.
func (l *local) Apply(ctx context.Context, er *diffapi.ApplyRequest, _ ...grpc.CallOption) (*diffapi.ApplyResponse, error) {
        if len(l.differs) == 0 {
                return nil, errdefs.ToGRPC(fmt.Errorf("no differ configured: %w", errdefs.ErrNotImplemented))
        }

        var (
                ocidesc ocispec.Descriptor
                err     error
                desc    = oci.DescriptorFromProto(er.Diff)
                mounts  = mount.FromProto(er.Mounts)
        )

        var opts []diff.ApplyOpt
        if er.Payloads != nil {
                // Re-key the payloads into the interface-typed map the option expects.
                payloads := make(map[string]typeurl.Any, len(er.Payloads))
                for k, v := range er.Payloads {
                        payloads[k] = v
                }
                opts = append(opts, diff.WithPayloads(payloads))
        }
        opts = append(opts, diff.WithSyncFs(er.SyncFs))

        // The loop variable is named d so it does not shadow the differ type.
        for _, d := range l.differs {
                ocidesc, err = d.Apply(ctx, desc, mounts, opts...)
                if !errdefs.IsNotImplemented(err) {
                        break
                }
        }

        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        return &diffapi.ApplyResponse{
                Applied: oci.DescriptorToProto(ocidesc),
        }, nil
}

// Diff computes the difference between the dr.Left and dr.Right mounts using
// the first configured differ that implements the operation.
//
// The media type, reference, labels and source date epoch from the request
// are passed as options only when they are set. Differs are tried in order;
// one that reports "not implemented" is skipped. If no differ is configured
// at all, a NotImplemented error is returned instead of reporting an empty
// descriptor as a successful diff. All errors are translated with
// errdefs.ToGRPC.
func (l *local) Diff(ctx context.Context, dr *diffapi.DiffRequest, _ ...grpc.CallOption) (*diffapi.DiffResponse, error) {
        if len(l.differs) == 0 {
                return nil, errdefs.ToGRPC(fmt.Errorf("no differ configured: %w", errdefs.ErrNotImplemented))
        }

        var (
                ocidesc ocispec.Descriptor
                err     error
                aMounts = mount.FromProto(dr.Left)
                bMounts = mount.FromProto(dr.Right)
        )

        var opts []diff.Opt
        if dr.MediaType != "" {
                opts = append(opts, diff.WithMediaType(dr.MediaType))
        }
        if dr.Ref != "" {
                opts = append(opts, diff.WithReference(dr.Ref))
        }
        if dr.Labels != nil {
                opts = append(opts, diff.WithLabels(dr.Labels))
        }
        if dr.SourceDateEpoch != nil {
                tm := dr.SourceDateEpoch.AsTime()
                opts = append(opts, diff.WithSourceDateEpoch(&tm))
        }

        for _, d := range l.differs {
                ocidesc, err = d.Compare(ctx, aMounts, bMounts, opts...)
                if !errdefs.IsNotImplemented(err) {
                        break
                }
        }
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        return &diffapi.DiffResponse{
                Diff: oci.DescriptorToProto(ocidesc),
        }, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package diff

import (
        "context"

        diffapi "github.com/containerd/containerd/api/services/diff/v1"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "google.golang.org/grpc"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "diff",
                Requires: []plugin.Type{
                        plugins.ServicePlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        i, err := ic.GetByID(plugins.ServicePlugin, services.DiffService)
                        if err != nil {
                                return nil, err
                        }
                        return &service{local: i.(diffapi.DiffClient)}, nil
                },
        })
}

// service is the gRPC-facing diff server; every call is delegated to local.
type service struct {
        // local is the in-process client that performs the actual work.
        local diffapi.DiffClient
        diffapi.UnimplementedDiffServer
}

// Compile-time check that service satisfies diffapi.DiffServer.
var _ diffapi.DiffServer = &service{}

// Register registers s as the Diff server on gs. It always returns nil.
func (s *service) Register(gs *grpc.Server) error {
        diffapi.RegisterDiffServer(gs, s)
        return nil
}

// Apply delegates the apply request to the local diff client and returns its
// response and error unchanged.
func (s *service) Apply(ctx context.Context, er *diffapi.ApplyRequest) (*diffapi.ApplyResponse, error) {
        return s.local.Apply(ctx, er)
}

// Diff delegates the diff request to the local diff client and returns its
// response and error unchanged.
func (s *service) Diff(ctx context.Context, dr *diffapi.DiffRequest) (*diffapi.DiffResponse, error) {
        return s.local.Diff(ctx, dr)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package events

import (
        "context"
        "fmt"

        api "github.com/containerd/containerd/api/services/events/v1"
        apittrpc "github.com/containerd/containerd/api/services/ttrpc/events/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/events/exchange"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "github.com/containerd/ttrpc"
        "google.golang.org/grpc"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "events",
                Requires: []plugin.Type{
                        plugins.EventPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        ep, err := ic.GetByID(plugins.EventPlugin, "exchange")
                        if err != nil {
                                return nil, err
                        }
                        return NewService(ep.(*exchange.Exchange)), nil
                },
        })
}

// service is the events server. It serves gRPC itself and carries a separate
// ttrpcService that is registered on ttrpc servers.
type service struct {
        // ttService is the handler registered by RegisterTTRPC.
        ttService *ttrpcService
        // events is the exchange used to publish, forward and subscribe.
        events *exchange.Exchange
        api.UnimplementedEventsServer
}

// NewService returns the GRPC events server. The gRPC service and its ttrpc
// companion are both backed by the same exchange.
func NewService(events *exchange.Exchange) api.EventsServer {
        tt := &ttrpcService{events: events}
        return &service{ttService: tt, events: events}
}

// Register registers s as the Events server on the given gRPC server. It
// always returns nil.
func (s *service) Register(server *grpc.Server) error {
        api.RegisterEventsServer(server, s)
        return nil
}

// RegisterTTRPC registers the ttrpc events handler on the given ttrpc server.
// It always returns nil.
func (s *service) RegisterTTRPC(server *ttrpc.Server) error {
        apittrpc.RegisterEventsService(server, s.ttService)
        return nil
}

// Publish publishes r.Event under r.Topic on the exchange. A failure is
// translated with errdefs.ToGRPC; success yields an empty response.
func (s *service) Publish(ctx context.Context, r *api.PublishRequest) (*ptypes.Empty, error) {
        err := s.events.Publish(ctx, r.Topic, r.Event)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return new(ptypes.Empty), nil
}

// Forward hands the envelope carried by r to the exchange.
//
// A request without an envelope is rejected as an invalid argument; without
// this check the conversion helper would dereference the nil envelope and
// panic. Exchange failures are translated with errdefs.ToGRPC.
func (s *service) Forward(ctx context.Context, r *api.ForwardRequest) (*ptypes.Empty, error) {
        if r.Envelope == nil {
                return nil, errdefs.ToGRPC(fmt.Errorf("envelope required: %w", errdefs.ErrInvalidArgument))
        }

        if err := s.events.Forward(ctx, fromProto(r.Envelope)); err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        return &ptypes.Empty{}, nil
}

// Subscribe streams events matching req.Filters to srv until the subscription
// ends. The subscription runs on a context derived from the stream's context
// and is cancelled when this method returns.
//
// It returns a wrapped error if sending to the subscriber fails or the
// subscription reports an error, and nil when the error channel yields nil.
func (s *service) Subscribe(req *api.SubscribeRequest, srv api.Events_SubscribeServer) error {
        ctx, cancel := context.WithCancel(srv.Context())
        defer cancel()

        envelopes, failures := s.events.Subscribe(ctx, req.Filters...)
        for {
                select {
                case envelope := <-envelopes:
                        sendErr := srv.Send(toProto(envelope))
                        if sendErr != nil {
                                return fmt.Errorf("failed sending event to subscriber: %w", sendErr)
                        }
                case subErr := <-failures:
                        if subErr == nil {
                                return nil
                        }
                        return fmt.Errorf("subscription error: %w", subErr)
                }
        }
}

// toProto converts an internal event envelope into its API message form.
func toProto(env *events.Envelope) *types.Envelope {
        out := new(types.Envelope)
        out.Timestamp = protobuf.ToTimestamp(env.Timestamp)
        out.Namespace = env.Namespace
        out.Topic = env.Topic
        out.Event = protobuf.FromAny(env.Event)
        return out
}

// fromProto converts an API envelope message into the internal envelope
// type. env must not be nil.
func fromProto(env *types.Envelope) *events.Envelope {
        out := new(events.Envelope)
        out.Timestamp = protobuf.FromTimestamp(env.Timestamp)
        out.Namespace = env.Namespace
        out.Topic = env.Topic
        out.Event = env.Event
        return out
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package events

import (
        "context"

        api "github.com/containerd/containerd/api/services/ttrpc/events/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/events/exchange"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/errdefs"
)

// ttrpcService is the ttrpc-facing events handler.
type ttrpcService struct {
        // events is the exchange that forwarded envelopes are delivered to.
        events *exchange.Exchange
}

// Forward hands the envelope carried by r to the exchange.
//
// A request without an envelope is rejected as an invalid argument; without
// this check the conversion helper would dereference the nil envelope and
// panic. Exchange failures are translated with errdefs.ToGRPC.
func (s *ttrpcService) Forward(ctx context.Context, r *api.ForwardRequest) (*ptypes.Empty, error) {
        if r.Envelope == nil {
                return nil, errdefs.ToGRPC(errdefs.ErrInvalidArgument)
        }

        if err := s.events.Forward(ctx, fromTProto(r.Envelope)); err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        return &ptypes.Empty{}, nil
}

// fromTProto converts an API envelope message received over ttrpc into the
// internal envelope type. env must not be nil.
func fromTProto(env *types.Envelope) *events.Envelope {
        out := new(events.Envelope)
        out.Timestamp = protobuf.FromTimestamp(env.Timestamp)
        out.Namespace = env.Namespace
        out.Topic = env.Topic
        out.Event = env.Event
        return out
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package healthcheck

import (
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"

        "google.golang.org/grpc"
        "google.golang.org/grpc/health"
        "google.golang.org/grpc/health/grpc_health_v1"
)

// service wraps the gRPC health server so it can be registered as a plugin.
type service struct {
        // serve is the health server instance registered on the gRPC server.
        serve *health.Server
}

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "healthcheck",
                InitFn: func(*plugin.InitContext) (interface{}, error) {
                        return newService()
                },
        })
}

// newService creates a service around a freshly created health server. The
// returned error is always nil.
func newService() (*service, error) {
        svc := &service{serve: health.NewServer()}
        return svc, nil
}

// Register registers the wrapped health server on the given gRPC server. It
// always returns nil.
func (s *service) Register(server *grpc.Server) error {
        grpc_health_v1.RegisterHealthServer(server, s.serve)
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        imagesapi "github.com/containerd/containerd/api/services/images/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)

// imagesToProto converts each image to its API message form, preserving
// order. An empty input yields a nil slice.
func imagesToProto(images []images.Image) []*imagesapi.Image {
        var converted []*imagesapi.Image
        for i := range images {
                // imageToProto only reads through the pointer, so the slice element
                // can be addressed directly without a per-iteration copy.
                converted = append(converted, imageToProto(&images[i]))
        }
        return converted
}

// imageToProto converts an image into its API message form. The labels map
// is shared with the source image, not copied.
func imageToProto(image *images.Image) *imagesapi.Image {
        pb := new(imagesapi.Image)
        pb.Name = image.Name
        pb.Labels = image.Labels
        pb.Target = descToProto(&image.Target)
        pb.CreatedAt = protobuf.ToTimestamp(image.CreatedAt)
        pb.UpdatedAt = protobuf.ToTimestamp(image.UpdatedAt)
        return pb
}

// imageFromProto converts an API image message into the internal image type.
// imagepb must not be nil.
func imageFromProto(imagepb *imagesapi.Image) images.Image {
        var img images.Image
        img.Name = imagepb.Name
        img.Labels = imagepb.Labels
        img.Target = descFromProto(imagepb.Target)
        img.CreatedAt = protobuf.FromTimestamp(imagepb.CreatedAt)
        img.UpdatedAt = protobuf.FromTimestamp(imagepb.UpdatedAt)
        return img
}

// descFromProto converts an API descriptor message into an OCI descriptor.
//
// A nil desc yields the zero descriptor instead of panicking. Image messages
// arriving over the API may omit their target, and imageFromProto passes that
// field through without a check. The digest string is converted without
// validation.
func descFromProto(desc *types.Descriptor) ocispec.Descriptor {
        if desc == nil {
                return ocispec.Descriptor{}
        }
        return ocispec.Descriptor{
                MediaType:   desc.MediaType,
                Size:        desc.Size,
                Digest:      digest.Digest(desc.Digest),
                Annotations: desc.Annotations,
        }
}

// descToProto converts an OCI descriptor into its API message form, rendering
// the digest as a string. desc must not be nil.
func descToProto(desc *ocispec.Descriptor) *types.Descriptor {
        pb := new(types.Descriptor)
        pb.MediaType = desc.MediaType
        pb.Size = desc.Size
        pb.Digest = desc.Digest.String()
        pb.Annotations = desc.Annotations
        return pb
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"

        "github.com/containerd/log"
        "google.golang.org/grpc"
        "google.golang.org/grpc/codes"
        "google.golang.org/grpc/status"

        imagesapi "github.com/containerd/containerd/api/services/images/v1"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/pkg/deprecation"
        "github.com/containerd/containerd/v2/pkg/epoch"
        "github.com/containerd/containerd/v2/pkg/gc"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/containerd/v2/plugins/services/warning"
        "github.com/containerd/errdefs"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.ServicePlugin,
                ID:   services.ImagesService,
                Requires: []plugin.Type{
                        plugins.MetadataPlugin,
                        plugins.GCPlugin,
                        plugins.WarningPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        m, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }
                        g, err := ic.GetSingle(plugins.GCPlugin)
                        if err != nil {
                                return nil, err
                        }
                        w, err := ic.GetSingle(plugins.WarningPlugin)
                        if err != nil {
                                return nil, err
                        }

                        return &local{
                                store:    metadata.NewImageStore(m.(*metadata.DB)),
                                gc:       g.(gcScheduler),
                                warnings: w.(warning.Service),
                        }, nil
                },
        })
}

// gcScheduler is the part of the GC plugin this service relies on: the
// ability to schedule a garbage collection and wait for its result.
type gcScheduler interface {
        ScheduleAndWait(context.Context) (gc.Stats, error)
}

// local implements the images client interface in-process.
type local struct {
        // store is the image store that all operations act on.
        store images.Store
        // gc is used to run a garbage collection for synchronous deletes.
        gc gcScheduler
        // warnings receives deprecation warnings emitted by this service.
        warnings warning.Service
}

// Compile-time check that local satisfies imagesapi.ImagesClient.
var _ imagesapi.ImagesClient = &local{}

// Get looks up the image named req.Name in the store and returns it in API
// form. Store errors are translated with errdefs.ToGRPC.
func (l *local) Get(ctx context.Context, req *imagesapi.GetImageRequest, _ ...grpc.CallOption) (*imagesapi.GetImageResponse, error) {
        found, err := l.store.Get(ctx, req.Name)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := new(imagesapi.GetImageResponse)
        resp.Image = imageToProto(&found)
        return resp, nil
}

// List returns the images in the store that match req.Filters, converted to
// API form. Store errors are translated with errdefs.ToGRPC.
func (l *local) List(ctx context.Context, req *imagesapi.ListImagesRequest, _ ...grpc.CallOption) (*imagesapi.ListImagesResponse, error) {
        matched, err := l.store.List(ctx, req.Filters...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := new(imagesapi.ListImagesResponse)
        resp.Images = imagesToProto(matched)
        return resp, nil
}

// Create stores the image described by req.Image and returns the created
// record in API form.
//
// A request without an image or with an empty image name is rejected with
// InvalidArgument. The nil check runs before anything reads from the message
// so that a malformed request cannot cause a nil dereference. When
// req.SourceDateEpoch is set it is attached to the context passed to the
// store. Store errors are translated with errdefs.ToGRPC. After a successful
// create, a deprecation warning is emitted if the requested image carries the
// converted-schema1 label.
func (l *local) Create(ctx context.Context, req *imagesapi.CreateImageRequest, _ ...grpc.CallOption) (*imagesapi.CreateImageResponse, error) {
        if req.Image == nil {
                return nil, status.Errorf(codes.InvalidArgument, "Image.Name required")
        }

        // The target is optional at this point; only read its digest for the
        // log line when it is present.
        var targetDigest string
        if req.Image.Target != nil {
                targetDigest = req.Image.Target.Digest
        }
        log.G(ctx).WithField("name", req.Image.Name).WithField("target", targetDigest).Debugf("create image")
        if req.Image.Name == "" {
                return nil, status.Errorf(codes.InvalidArgument, "Image.Name required")
        }

        var (
                image = imageFromProto(req.Image)
                resp  imagesapi.CreateImageResponse
        )
        if req.SourceDateEpoch != nil {
                tm := req.SourceDateEpoch.AsTime()
                ctx = epoch.WithSourceDateEpoch(ctx, &tm)
        }
        created, err := l.store.Create(ctx, image)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp.Image = imageToProto(&created)

        l.emitSchema1DeprecationWarning(ctx, &image)
        return &resp, nil
}

// Update applies req.Image to the stored image of the same name and returns
// the updated record in API form.
//
// A request without an image or with an empty image name is rejected with
// InvalidArgument; the nil check prevents a nil dereference on a malformed
// request. The paths in req.UpdateMask, if any, are passed to the store as
// field paths. When req.SourceDateEpoch is set it is attached to the context
// passed to the store. Store errors are translated with errdefs.ToGRPC. After
// a successful update, a deprecation warning is emitted if the requested
// image carries the converted-schema1 label.
func (l *local) Update(ctx context.Context, req *imagesapi.UpdateImageRequest, _ ...grpc.CallOption) (*imagesapi.UpdateImageResponse, error) {
        if req.Image == nil || req.Image.Name == "" {
                return nil, status.Errorf(codes.InvalidArgument, "Image.Name required")
        }

        var (
                image      = imageFromProto(req.Image)
                resp       imagesapi.UpdateImageResponse
                fieldpaths []string
        )

        if req.UpdateMask != nil && len(req.UpdateMask.Paths) > 0 {
                fieldpaths = append(fieldpaths, req.UpdateMask.Paths...)
        }

        if req.SourceDateEpoch != nil {
                tm := req.SourceDateEpoch.AsTime()
                ctx = epoch.WithSourceDateEpoch(ctx, &tm)
        }

        updated, err := l.store.Update(ctx, image, fieldpaths...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp.Image = imageToProto(&updated)

        l.emitSchema1DeprecationWarning(ctx, &image)
        return &resp, nil
}

// Delete removes the image named req.Name from the store.
//
// When req.Target is set, it is passed to the store as a delete option. When
// req.Sync is set, a garbage collection is scheduled and awaited after the
// image record has been removed. Errors from both the store and the garbage
// collector are translated with errdefs.ToGRPC so every failure path returns
// a gRPC-style error.
func (l *local) Delete(ctx context.Context, req *imagesapi.DeleteImageRequest, _ ...grpc.CallOption) (*ptypes.Empty, error) {
        log.G(ctx).WithField("name", req.Name).Debugf("delete image")

        var opts []images.DeleteOpt
        if req.Target != nil {
                desc := descFromProto(req.Target)
                opts = append(opts, images.DeleteTarget(&desc))
        }

        if err := l.store.Delete(ctx, req.Name, opts...); err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        // The Sync option is handled here, after the store delete has completed.
        if req.Sync {
                if _, err := l.gc.ScheduleAndWait(ctx); err != nil {
                        return nil, errdefs.ToGRPC(err)
                }
        }

        return &ptypes.Empty{}, nil
}

// emitSchema1DeprecationWarning logs a warning and emits the
// PullSchema1Image deprecation when image carries the converted-schema1
// label. It does nothing for a nil image or one without that label.
func (l *local) emitSchema1DeprecationWarning(ctx context.Context, image *images.Image) {
        if image == nil {
                return
        }
        if schema1Digest, labelled := image.Labels[images.ConvertedDockerSchema1LabelKey]; labelled {
                logger := log.G(ctx).WithField("name", image.Name).WithField("schema1digest", schema1Digest)
                logger.Warn("conversion from schema 1 images is deprecated")
                l.warnings.Emit(ctx, deprecation.PullSchema1Image)
        }
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package images

import (
        "context"

        imagesapi "github.com/containerd/containerd/api/services/images/v1"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "google.golang.org/grpc"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "images",
                Requires: []plugin.Type{
                        plugins.ServicePlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        i, err := ic.GetByID(plugins.ServicePlugin, services.ImagesService)
                        if err != nil {
                                return nil, err
                        }
                        return &service{local: i.(imagesapi.ImagesClient)}, nil
                },
        })
}

// service is the gRPC images server. It holds the images client obtained
// from the service plugin and embeds the generated unimplemented server.
type service struct {
        local imagesapi.ImagesClient
        imagesapi.UnimplementedImagesServer
}

// Compile-time check that *service satisfies the generated server interface.
var _ imagesapi.ImagesServer = (*service)(nil)

// Register attaches the images service to the given gRPC server. It always
// returns nil.
func (s *service) Register(server *grpc.Server) error {
        var impl imagesapi.ImagesServer = s
        imagesapi.RegisterImagesServer(server, impl)
        return nil
}

// Get forwards the request to the local images client and returns its
// response and error unchanged.
func (s *service) Get(ctx context.Context, req *imagesapi.GetImageRequest) (*imagesapi.GetImageResponse, error) {
        resp, err := s.local.Get(ctx, req)
        return resp, err
}

// List forwards the request to the local images client and returns its
// response and error unchanged.
func (s *service) List(ctx context.Context, req *imagesapi.ListImagesRequest) (*imagesapi.ListImagesResponse, error) {
        resp, err := s.local.List(ctx, req)
        return resp, err
}

// Create forwards the request to the local images client and returns its
// response and error unchanged.
func (s *service) Create(ctx context.Context, req *imagesapi.CreateImageRequest) (*imagesapi.CreateImageResponse, error) {
        resp, err := s.local.Create(ctx, req)
        return resp, err
}

// Update forwards the request to the local images client and returns its
// response and error unchanged.
func (s *service) Update(ctx context.Context, req *imagesapi.UpdateImageRequest) (*imagesapi.UpdateImageResponse, error) {
        resp, err := s.local.Update(ctx, req)
        return resp, err
}

// Delete forwards the request to the local images client and returns its
// response and error unchanged.
func (s *service) Delete(ctx context.Context, req *imagesapi.DeleteImageRequest) (*ptypes.Empty, error) {
        empty, err := s.local.Delete(ctx, req)
        return empty, err
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package introspection

import (
        context "context"
        "errors"
        "fmt"
        "os"
        "path/filepath"
        "runtime"
        "sync"

        "github.com/google/uuid"
        "google.golang.org/genproto/googleapis/rpc/code"
        rpc "google.golang.org/genproto/googleapis/rpc/status"
        "google.golang.org/grpc/status"

        api "github.com/containerd/containerd/api/services/introspection/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/introspection"
        "github.com/containerd/containerd/v2/pkg/filters"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/containerd/v2/plugins/services/warning"
        "github.com/containerd/errdefs"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "github.com/containerd/typeurl/v2"
)

func init() {
        registry.Register(&plugin.Registration{
                Type:     plugins.ServicePlugin,
                ID:       services.IntrospectionService,
                Requires: []plugin.Type{plugins.WarningPlugin},
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        i, err := ic.GetByID(plugins.WarningPlugin, plugins.DeprecationsPlugin)
                        if err != nil {
                                return nil, err
                        }

                        warningClient, ok := i.(warning.Service)
                        if !ok {
                                return nil, errors.New("could not create a local client for warning service")
                        }

                        // this service fetches all plugins through the plugin set of the plugin context
                        return &Local{
                                plugins:       ic.Plugins(),
                                root:          ic.Properties[plugins.PropertyRootDir],
                                warningClient: warningClient,
                        }, nil
                },
        })
}

// Local is a local implementation of the introspection service.
type Local struct {
        // mu is held while root is updated, while the uuid file is read or
        // generated, and while pluginCache is refreshed.
        mu sync.Mutex
        // root is the directory under which the "uuid" file is stored.
        root string
        // plugins is the plugin set the service reports on.
        plugins *plugin.Set
        // pluginCache holds the protobuf form of the plugin set.
        pluginCache []*api.Plugin
        // warningClient supplies the deprecation warnings returned by Server.
        warningClient warning.Service
}

// Compile-time check that *Local satisfies introspection.Service.
var _ introspection.Service = (*Local)(nil)

// UpdateLocal replaces the root directory used by the local introspection
// service. The assignment is made while holding the service mutex.
func (l *Local) UpdateLocal(root string) {
        l.mu.Lock()
        l.root = root
        l.mu.Unlock()
}

// Plugins returns the locally defined plugins that match the given filter
// expressions. A filter that fails to parse yields an error wrapping
// errdefs.ErrInvalidArgument.
func (l *Local) Plugins(ctx context.Context, fs ...string) (*api.PluginsResponse, error) {
        filter, err := filters.ParseAll(fs...)
        if err != nil {
                return nil, fmt.Errorf("%w: %w", errdefs.ErrInvalidArgument, err)
        }

        resp := &api.PluginsResponse{}
        for _, candidate := range l.getPlugins() {
                if !filter.Match(adaptPlugin(candidate)) {
                        continue
                }
                resp.Plugins = append(resp.Plugins, candidate)
        }
        return resp, nil
}

// getPlugins returns the protobuf form of every plugin in the set. The
// converted slice is cached and rebuilt only when the cache is unset or its
// length differs from the current plugin set.
func (l *Local) getPlugins() []*api.Plugin {
        l.mu.Lock()
        defer l.mu.Unlock()

        current := l.plugins.GetAll()
        if l.pluginCache != nil && len(l.pluginCache) == len(current) {
                return l.pluginCache
        }
        l.pluginCache = pluginsToPB(current)
        return l.pluginCache
}

// Server returns the local server information: its UUID, process ID, PID
// namespace (looked up on Linux only) and current deprecation warnings.
func (l *Local) Server(ctx context.Context) (*api.ServerResponse, error) {
        id, err := l.getUUID()
        if err != nil {
                return nil, err
        }

        pid := os.Getpid()
        resp := &api.ServerResponse{
                UUID: id,
                Pid:  uint64(pid),
        }
        // Pidns stays zero on platforms other than Linux.
        if runtime.GOOS == "linux" {
                if resp.Pidns, err = statPIDNS(pid); err != nil {
                        return nil, err
                }
        }
        resp.Deprecations = l.getWarnings(ctx)
        return resp, nil
}

// getUUID returns the server UUID stored in the uuid file under the root
// directory, generating and persisting a new one when none is available.
//
// A missing file and an existing but empty file are both treated as "no
// UUID yet" and trigger generation. An empty file can be left behind if a
// previous write was interrupted; without this handling every later call
// would fail to parse it. Any other read error, or non-empty content that
// is not a valid UUID, is returned as an error.
func (l *Local) getUUID() (string, error) {
        l.mu.Lock()
        defer l.mu.Unlock()

        data, err := os.ReadFile(l.uuidPath())
        if err != nil {
                if os.IsNotExist(err) {
                        return l.generateUUID()
                }
                return "", err
        }
        // Recover from an empty uuid file instead of failing forever.
        if len(data) == 0 {
                return l.generateUUID()
        }
        u := string(data)
        if _, err := uuid.Parse(u); err != nil {
                return "", err
        }
        return u, nil
}

// generateUUID creates a random UUID, writes it to the uuid file (creating
// the parent directory if needed) and returns its string form.
func (l *Local) generateUUID() (string, error) {
        id, err := uuid.NewRandom()
        if err != nil {
                return "", err
        }

        target := l.uuidPath()
        if err = os.MkdirAll(filepath.Dir(target), 0700); err != nil {
                return "", err
        }

        encoded := id.String()
        if err = os.WriteFile(target, []byte(encoded), 0666); err != nil {
                return "", err
        }
        return encoded, nil
}

// uuidPath returns the location of the uuid file inside the root directory.
func (l *Local) uuidPath() string {
        const fileName = "uuid"
        return filepath.Join(l.root, fileName)
}

// getWarnings fetches the warnings from the warning service and converts
// them to their protobuf form.
func (l *Local) getWarnings(ctx context.Context) []*api.DeprecationWarning {
        current := l.warningClient.Warnings()
        return warningsPB(ctx, current)
}

// adaptPlugin exposes an *api.Plugin to the filter system. Only the "type"
// and "id" fields can be matched; every other field path reports no value.
// The argument must be an *api.Plugin, otherwise the type assertion panics.
func adaptPlugin(o interface{}) filters.Adaptor {
        obj := o.(*api.Plugin)

        lookup := func(fieldpath []string) (string, bool) {
                if len(fieldpath) == 0 {
                        return "", false
                }

                switch field := fieldpath[0]; field {
                case "type":
                        return obj.Type, obj.Type != ""
                case "id":
                        return obj.ID, obj.ID != ""
                case "platforms":
                        // TODO(stevvooe): Another case here where have multiple values.
                        // May need to refactor the filter system to allow filtering by
                        // platform, if this is required.
                case "capabilities":
                        // TODO(stevvooe): Need a better way to match against
                        // collections. We can only return "the value" but really it
                        // would be best if we could return a set of values for the
                        // path, any of which could match.
                }

                return "", false
        }
        return filters.AdapterFunc(lookup)
}

// pluginToPB converts a plugin to its protobuf form. If the plugin reports
// an init error, the error is attached as an rpc status; when it cannot be
// read as a gRPC status it is recorded with code UNKNOWN and its message.
func pluginToPB(p *plugin.Plugin) *api.Plugin {
        pb := &api.Plugin{
                Type:         p.Registration.Type.String(),
                ID:           p.Registration.ID,
                Platforms:    types.OCIPlatformToProto(p.Meta.Platforms),
                Capabilities: p.Meta.Capabilities,
                Exports:      p.Meta.Exports,
        }
        for _, r := range p.Registration.Requires {
                pb.Requires = append(pb.Requires, r.String())
        }

        err := p.Err()
        if err == nil {
                return pb
        }

        st, ok := status.FromError(errdefs.ToGRPC(err))
        if !ok {
                pb.InitErr = &rpc.Status{
                        Code:    int32(code.Code_UNKNOWN),
                        Message: err.Error(),
                }
                return pb
        }

        // Copy the status details into the project's Any type.
        var details []*ptypes.Any
        for _, d := range st.Proto().Details {
                details = append(details, &ptypes.Any{
                        TypeUrl: d.TypeUrl,
                        Value:   d.Value,
                })
        }
        pb.InitErr = &rpc.Status{
                Code:    int32(st.Code()),
                Message: st.Message(),
                Details: details,
        }
        return pb
}

// pluginsToPB converts each plugin to its protobuf form, preserving order.
// The result is always a non-nil slice, even for empty input.
func pluginsToPB(plugins []*plugin.Plugin) []*api.Plugin {
        converted := make([]*api.Plugin, len(plugins))
        for i := range plugins {
                converted[i] = pluginToPB(plugins[i])
        }
        return converted
}

// warningsPB converts warnings to their protobuf form, preserving order. It
// returns nil when there are no warnings. The ctx argument is not used.
func warningsPB(ctx context.Context, warnings []warning.Warning) []*api.DeprecationWarning {
        var out []*api.DeprecationWarning
        for i := range warnings {
                w := &warnings[i]
                out = append(out, &api.DeprecationWarning{
                        ID:             string(w.ID),
                        Message:        w.Message,
                        LastOccurrence: protobuf.ToTimestamp(w.LastOccurrence),
                })
        }
        return out
}

// pluginInfoProvider is implemented by plugin instances that can return
// extra information about themselves. Local.PluginInfo checks a plugin
// instance against this interface before requesting the extra info.
type pluginInfoProvider interface {
        // PluginInfo returns plugin-specific information for the given options.
        PluginInfo(context.Context, interface{}) (interface{}, error)
}

// PluginInfo returns the description of the plugin identified by pluginType
// and id, or an error wrapping errdefs.ErrNotFound when no such plugin is
// registered.
//
// When options is non-nil, extra information is requested from the plugin
// instance and attached as resp.Extra. On failures in that step the basic
// response is still returned alongside the error.
func (l *Local) PluginInfo(ctx context.Context, pluginType, id string, options any) (*api.PluginInfoResponse, error) {
        p := l.plugins.Get(plugin.Type(pluginType), id)
        if p == nil {
                return nil, fmt.Errorf("plugin %s.%s not found: %w", pluginType, id, errdefs.ErrNotFound)
        }

        resp := &api.PluginInfoResponse{
                Plugin: pluginToPB(p),
        }
        if options == nil {
                // No additional info requested.
                return resp, nil
        }

        // Request additional info from plugin instance
        if p.Err() != nil {
                return resp, fmt.Errorf("cannot get extra info, plugin not successfully loaded: %w", errdefs.ErrFailedPrecondition)
        }
        inst, err := p.Instance()
        if err != nil {
                return resp, fmt.Errorf("failed to get plugin instance: %w", errdefs.ErrFailedPrecondition)
        }
        provider, ok := inst.(pluginInfoProvider)
        if !ok {
                return resp, fmt.Errorf("plugin does not provided extra information: %w", errdefs.ErrNotImplemented)
        }

        info, err := provider.PluginInfo(ctx, options)
        if err != nil {
                return resp, errdefs.ToGRPC(err)
        }
        extra, err := typeurl.MarshalAny(info)
        if err != nil {
                return resp, fmt.Errorf("failed to marshal plugin info: %w", err)
        }
        resp.Extra = &ptypes.Any{
                TypeUrl: extra.GetTypeUrl(),
                Value:   extra.GetValue(),
        }
        return resp, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package introspection

import (
        "fmt"
        "os"
        "syscall"
)

// statPIDNS stats /proc/<pid>/ns/pid and returns the inode number reported
// for it. It returns an error if the stat fails or if the file info does not
// carry a *syscall.Stat_t.
func statPIDNS(pid int) (uint64, error) {
        info, err := os.Stat(fmt.Sprintf("/proc/%d/ns/pid", pid))
        if err != nil {
                return 0, err
        }
        if raw, ok := info.Sys().(*syscall.Stat_t); ok {
                return raw.Ino, nil
        }
        return 0, fmt.Errorf("%T is not *syscall.Stat_t", info.Sys())
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package introspection

import (
        context "context"
        "errors"
        "fmt"

        api "github.com/containerd/containerd/api/services/introspection/v1"
        "github.com/containerd/containerd/v2/core/introspection"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/errdefs"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "github.com/containerd/typeurl/v2"
        "google.golang.org/grpc"
)

func init() {
        registry.Register(&plugin.Registration{
                Type:     plugins.GRPCPlugin,
                ID:       "introspection",
                Requires: []plugin.Type{plugins.ServicePlugin},
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        i, err := ic.GetByID(plugins.ServicePlugin, services.IntrospectionService)
                        if err != nil {
                                return nil, err
                        }

                        localClient, ok := i.(*Local)
                        if !ok {
                                return nil, errors.New("could not create a local client for introspection service")
                        }
                        localClient.UpdateLocal(ic.Properties[plugins.PropertyRootDir])

                        return &server{
                                local: localClient,
                        }, nil
                },
        })
}

// server is the gRPC introspection server. It holds the introspection
// service it calls and embeds the generated unimplemented server.
type server struct {
        local introspection.Service
        api.UnimplementedIntrospectionServer
}

// Compile-time check that *server satisfies the generated server interface.
var _ api.IntrospectionServer = (*server)(nil)

// Register attaches the introspection service to the given gRPC server. It
// always returns nil.
func (s *server) Register(server *grpc.Server) error {
        var impl api.IntrospectionServer = s
        api.RegisterIntrospectionServer(server, impl)
        return nil
}

// Plugins calls the local service with the request's filters and converts
// any error to its gRPC form.
func (s *server) Plugins(ctx context.Context, req *api.PluginsRequest) (resp *api.PluginsResponse, err error) {
        found, lerr := s.local.Plugins(ctx, req.Filters...)
        return found, errdefs.ToGRPC(lerr)
}

// Server calls the local service for the server information and converts
// any error to its gRPC form. The request message is ignored.
func (s *server) Server(ctx context.Context, _ *ptypes.Empty) (resp *api.ServerResponse, err error) {
        info, lerr := s.local.Server(ctx)
        return info, errdefs.ToGRPC(lerr)
}

// PluginInfo decodes the optional request options, asks the local service
// for the plugin information and converts any error to its gRPC form. If
// the options cannot be decoded, a nil response is returned with the error.
func (s *server) PluginInfo(ctx context.Context, req *api.PluginInfoRequest) (resp *api.PluginInfoResponse, err error) {
        var options any
        if packed := req.Options; packed != nil {
                decoded, uerr := typeurl.UnmarshalAny(packed)
                if uerr != nil {
                        return nil, errdefs.ToGRPC(fmt.Errorf("failed to unmarshal plugin info Options: %w", uerr))
                }
                options = decoded
        }

        info, lerr := s.local.PluginInfo(ctx, req.Type, req.ID, options)
        return info, errdefs.ToGRPC(lerr)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package leases

import (
        "context"

        api "github.com/containerd/containerd/api/services/leases/v1"
        "github.com/containerd/containerd/v2/core/leases"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "google.golang.org/grpc"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "leases",
                Requires: []plugin.Type{
                        plugins.LeasePlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        i, err := ic.GetByID(plugins.LeasePlugin, "manager")
                        if err != nil {
                                return nil, err
                        }
                        return &service{lm: i.(leases.Manager)}, nil
                },
        })
}

// service is the gRPC leases server. It holds the lease manager its RPC
// methods call and embeds the generated unimplemented server.
type service struct {
        // lm is the lease manager backing the RPC methods.
        lm leases.Manager
        api.UnimplementedLeasesServer
}

// Register attaches the leases service to the given gRPC server. It always
// returns nil.
func (s *service) Register(server *grpc.Server) error {
        var impl api.LeasesServer = s
        api.RegisterLeasesServer(server, impl)
        return nil
}

// Create creates a lease with the request's labels. The lease uses the
// requested ID, or a random ID when the request leaves it empty. Errors
// from the lease manager are converted to their gRPC form.
func (s *service) Create(ctx context.Context, r *api.CreateRequest) (*api.CreateResponse, error) {
        var idOpt leases.Opt
        if r.ID != "" {
                idOpt = leases.WithID(r.ID)
        } else {
                idOpt = leases.WithRandomID()
        }

        created, err := s.lm.Create(ctx, leases.WithLabels(r.Labels), idOpt)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &api.CreateResponse{Lease: leaseToGRPC(created)}
        return resp, nil
}

// Delete removes the lease with the requested ID, passing the synchronous
// delete option when the request sets Sync. Errors from the lease manager
// are converted to their gRPC form.
func (s *service) Delete(ctx context.Context, r *api.DeleteRequest) (*ptypes.Empty, error) {
        target := leases.Lease{ID: r.ID}

        var opts []leases.DeleteOpt
        if r.Sync {
                opts = append(opts, leases.SynchronousDelete)
        }

        err := s.lm.Delete(ctx, target, opts...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return &ptypes.Empty{}, nil
}

// List returns the leases matching the request's filters, converted to
// their protobuf form. Errors from the lease manager are converted to their
// gRPC form.
func (s *service) List(ctx context.Context, r *api.ListRequest) (*api.ListResponse, error) {
        found, err := s.lm.List(ctx, r.Filters...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        converted := make([]*api.Lease, 0, len(found))
        for _, lease := range found {
                converted = append(converted, leaseToGRPC(lease))
        }

        return &api.ListResponse{Leases: converted}, nil
}

// AddResource attaches the requested resource to the lease with the
// requested ID.
//
// A request that carries no resource is rejected with an invalid-argument
// error rather than dereferencing a nil message, which would panic the
// server. Errors from the lease manager are converted to their gRPC form.
func (s *service) AddResource(ctx context.Context, r *api.AddResourceRequest) (*ptypes.Empty, error) {
        // The resource message is supplied by the client and may be absent.
        if r.Resource == nil {
                return nil, errdefs.ToGRPC(errdefs.ErrInvalidArgument)
        }

        lease := leases.Lease{
                ID: r.ID,
        }

        if err := s.lm.AddResource(ctx, lease, leases.Resource{
                ID:   r.Resource.ID,
                Type: r.Resource.Type,
        }); err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return &ptypes.Empty{}, nil
}

// DeleteResource detaches the requested resource from the lease with the
// requested ID.
//
// A request that carries no resource is rejected with an invalid-argument
// error rather than dereferencing a nil message, which would panic the
// server. Errors from the lease manager are converted to their gRPC form.
func (s *service) DeleteResource(ctx context.Context, r *api.DeleteResourceRequest) (*ptypes.Empty, error) {
        // The resource message is supplied by the client and may be absent.
        if r.Resource == nil {
                return nil, errdefs.ToGRPC(errdefs.ErrInvalidArgument)
        }

        lease := leases.Lease{
                ID: r.ID,
        }

        if err := s.lm.DeleteResource(ctx, lease, leases.Resource{
                ID:   r.Resource.ID,
                Type: r.Resource.Type,
        }); err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return &ptypes.Empty{}, nil
}

// ListResources returns the resources held by the lease with the requested
// ID, converted to their protobuf form. Errors from the lease manager are
// converted to their gRPC form.
func (s *service) ListResources(ctx context.Context, r *api.ListResourcesRequest) (*api.ListResourcesResponse, error) {
        held, err := s.lm.ListResources(ctx, leases.Lease{ID: r.ID})
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        converted := make([]*api.Resource, len(held))
        for idx := range held {
                converted[idx] = &api.Resource{
                        ID:   held[idx].ID,
                        Type: held[idx].Type,
                }
        }
        return &api.ListResourcesResponse{Resources: converted}, nil
}

// leaseToGRPC converts a lease to its protobuf form, copying the ID and
// labels and converting the creation time to a protobuf timestamp.
func leaseToGRPC(l leases.Lease) *api.Lease {
        out := new(api.Lease)
        out.ID = l.ID
        out.Labels = l.Labels
        out.CreatedAt = protobuf.ToTimestamp(l.CreatedAt)
        return out
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package namespaces

import (
        "context"
        "strings"

        eventstypes "github.com/containerd/containerd/api/events"
        api "github.com/containerd/containerd/api/services/namespaces/v1"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/errdefs"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        bolt "go.etcd.io/bbolt"
        "google.golang.org/grpc"
        "google.golang.org/grpc/codes"
        "google.golang.org/grpc/status"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.ServicePlugin,
                ID:   services.NamespacesService,
                Requires: []plugin.Type{
                        plugins.EventPlugin,
                        plugins.MetadataPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        m, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }
                        ep, err := ic.GetSingle(plugins.EventPlugin)
                        if err != nil {
                                return nil, err
                        }
                        return &local{
                                db:        m.(*metadata.DB),
                                publisher: ep.(events.Publisher),
                        }, nil
                },
        })
}

// local is the in-process namespaces service. A service is provided here
// rather than exposing the namespace store directly, because the store
// interface doesn't offer enough functionality for the namespaces service.
type local struct {
        // db is the metadata database whose transactions back every request.
        db *metadata.DB
        // publisher emits the namespace create/update/delete events.
        publisher events.Publisher
}

// Compile-time check that *local satisfies the generated client interface.
var _ api.NamespacesClient = (*local)(nil)

// Get looks up the labels of the requested namespace inside a read-only
// transaction and returns them with the namespace name. The response
// pointer is non-nil even when an error is returned.
func (l *local) Get(ctx context.Context, req *api.GetNamespaceRequest, _ ...grpc.CallOption) (*api.GetNamespaceResponse, error) {
        resp := &api.GetNamespaceResponse{}

        err := l.withStoreView(ctx, func(ctx context.Context, store namespaces.Store) error {
                labels, err := store.Labels(ctx, req.Name)
                if err != nil {
                        return errdefs.ToGRPC(err)
                }

                resp.Namespace = &api.Namespace{Name: req.Name, Labels: labels}
                return nil
        })
        return resp, err
}

// List returns every namespace together with its labels, read inside a
// single read-only transaction. The response pointer is non-nil even when
// an error is returned.
//
// Store errors from both the listing and the label lookups are converted to
// their gRPC form, so callers see the same error mapping for either step.
func (l *local) List(ctx context.Context, req *api.ListNamespacesRequest, _ ...grpc.CallOption) (*api.ListNamespacesResponse, error) {
        var resp api.ListNamespacesResponse

        return &resp, l.withStoreView(ctx, func(ctx context.Context, store namespaces.Store) error {
                // Named "names" so the imported namespaces package is not shadowed.
                names, err := store.List(ctx)
                if err != nil {
                        return errdefs.ToGRPC(err)
                }

                for _, name := range names {
                        labels, err := store.Labels(ctx, name)
                        if err != nil {
                                // In general, this should be unlikely, since we are holding a
                                // transaction to service this request.
                                return errdefs.ToGRPC(err)
                        }

                        resp.Namespaces = append(resp.Namespaces, &api.Namespace{
                                Name:   name,
                                Labels: labels,
                        })
                }

                return nil
        })
}

// Create creates the requested namespace and sets its labels inside one
// update transaction, then publishes a "/namespaces/create" event with the
// namespace set on the context. The response pointer is non-nil even when
// an error is returned.
//
// A request without a namespace message is rejected with InvalidArgument
// instead of dereferencing nil. Store errors, including those from setting
// labels, are converted to their gRPC form.
func (l *local) Create(ctx context.Context, req *api.CreateNamespaceRequest, _ ...grpc.CallOption) (*api.CreateNamespaceResponse, error) {
        var resp api.CreateNamespaceResponse

        // The namespace message is supplied by the client and may be absent.
        if req.Namespace == nil {
                return &resp, status.Error(codes.InvalidArgument, "namespace is required")
        }

        if err := l.withStoreUpdate(ctx, func(ctx context.Context, store namespaces.Store) error {
                if err := store.Create(ctx, req.Namespace.Name, req.Namespace.Labels); err != nil {
                        return errdefs.ToGRPC(err)
                }

                for k, v := range req.Namespace.Labels {
                        if err := store.SetLabel(ctx, req.Namespace.Name, k, v); err != nil {
                                return errdefs.ToGRPC(err)
                        }
                }

                resp.Namespace = req.Namespace
                return nil
        }); err != nil {
                return &resp, err
        }

        ctx = namespaces.WithNamespace(ctx, req.Namespace.Name)
        if err := l.publisher.Publish(ctx, "/namespaces/create", &eventstypes.NamespaceCreate{
                Name:   req.Namespace.Name,
                Labels: req.Namespace.Labels,
        }); err != nil {
                return &resp, err
        }

        return &resp, nil
}

// Update changes the labels of the requested namespace inside one update
// transaction, then publishes a "/namespaces/update" event with the
// namespace set on the context. The response pointer is non-nil even when
// an error is returned.
//
// With a non-empty update mask only "labels.<key>" paths are accepted and
// each named label is set from the request; any other path is rejected with
// InvalidArgument. Without a mask, every existing label is set to "" and
// the request's labels are then applied.
//
// A request without a namespace message is rejected with InvalidArgument
// instead of dereferencing nil. Store errors, including those from setting
// labels, are converted to their gRPC form.
func (l *local) Update(ctx context.Context, req *api.UpdateNamespaceRequest, _ ...grpc.CallOption) (*api.UpdateNamespaceResponse, error) {
        var resp api.UpdateNamespaceResponse

        // The namespace message is supplied by the client and may be absent.
        if req.Namespace == nil {
                return &resp, status.Error(codes.InvalidArgument, "namespace is required")
        }

        if err := l.withStoreUpdate(ctx, func(ctx context.Context, store namespaces.Store) error {
                if req.UpdateMask != nil && len(req.UpdateMask.Paths) > 0 {
                        for _, path := range req.UpdateMask.Paths {
                                switch {
                                case strings.HasPrefix(path, "labels."):
                                        key := strings.TrimPrefix(path, "labels.")
                                        if err := store.SetLabel(ctx, req.Namespace.Name, key, req.Namespace.Labels[key]); err != nil {
                                                return errdefs.ToGRPC(err)
                                        }
                                default:
                                        return status.Errorf(codes.InvalidArgument, "cannot update %q field", path)
                                }
                        }
                        return nil
                }

                // clear out the existing labels and then set them to the incoming request.
                // get current set of labels
                labels, err := store.Labels(ctx, req.Namespace.Name)
                if err != nil {
                        return errdefs.ToGRPC(err)
                }

                for k := range labels {
                        if err := store.SetLabel(ctx, req.Namespace.Name, k, ""); err != nil {
                                return errdefs.ToGRPC(err)
                        }
                }

                for k, v := range req.Namespace.Labels {
                        if err := store.SetLabel(ctx, req.Namespace.Name, k, v); err != nil {
                                return errdefs.ToGRPC(err)
                        }
                }

                return nil
        }); err != nil {
                return &resp, err
        }

        ctx = namespaces.WithNamespace(ctx, req.Namespace.Name)
        if err := l.publisher.Publish(ctx, "/namespaces/update", &eventstypes.NamespaceUpdate{
                Name:   req.Namespace.Name,
                Labels: req.Namespace.Labels,
        }); err != nil {
                return &resp, err
        }

        return &resp, nil
}

// Delete removes the requested namespace inside an update transaction and
// then publishes a "/namespaces/delete" event. A non-nil empty message is
// returned in every case, including alongside errors.
func (l *local) Delete(ctx context.Context, req *api.DeleteNamespaceRequest, _ ...grpc.CallOption) (*ptypes.Empty, error) {
        remove := func(ctx context.Context, store namespaces.Store) error {
                return errdefs.ToGRPC(store.Delete(ctx, req.Name))
        }
        if err := l.withStoreUpdate(ctx, remove); err != nil {
                return &ptypes.Empty{}, err
        }

        // set the namespace in the context before publishing the event
        ctx = namespaces.WithNamespace(ctx, req.Name)
        event := &eventstypes.NamespaceDelete{Name: req.Name}
        if err := l.publisher.Publish(ctx, "/namespaces/delete", event); err != nil {
                return &ptypes.Empty{}, err
        }

        return &ptypes.Empty{}, nil
}

// withStore adapts fn to a bolt transaction callback: the returned function
// builds a namespace store on the transaction and calls fn with it and ctx.
func (l *local) withStore(ctx context.Context, fn func(ctx context.Context, store namespaces.Store) error) func(tx *bolt.Tx) error {
        return func(tx *bolt.Tx) error {
                store := metadata.NewNamespaceStore(tx)
                return fn(ctx, store)
        }
}

// withStoreView runs fn with a namespace store through the database's View
// method and returns the resulting error.
func (l *local) withStoreView(ctx context.Context, fn func(ctx context.Context, store namespaces.Store) error) error {
        txFn := l.withStore(ctx, fn)
        return l.db.View(txFn)
}

// withStoreUpdate runs fn with a namespace store through the database's
// Update method and returns the resulting error.
func (l *local) withStoreUpdate(ctx context.Context, fn func(ctx context.Context, store namespaces.Store) error) error {
        txFn := l.withStore(ctx, fn)
        return l.db.Update(txFn)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package namespaces

import (
        "context"

        api "github.com/containerd/containerd/api/services/namespaces/v1"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "google.golang.org/grpc"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "namespaces",
                Requires: []plugin.Type{
                        plugins.ServicePlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        i, err := ic.GetByID(plugins.ServicePlugin, services.NamespacesService)
                        if err != nil {
                                return nil, err
                        }
                        return &service{local: i.(api.NamespacesClient)}, nil
                },
        })
}

// service is the gRPC front end for the namespaces API; each RPC it
// implements forwards the request to the local client.
type service struct {
        // local is the namespaces client that every RPC is delegated to.
        local api.NamespacesClient
        api.UnimplementedNamespacesServer
}

// Compile-time check that *service satisfies api.NamespacesServer.
var _ api.NamespacesServer = &service{}

// Register attaches s to server as the Namespaces service implementation.
// It always returns nil.
func (s *service) Register(server *grpc.Server) error {
        api.RegisterNamespacesServer(server, s)
        return nil
}

// Get forwards req unchanged to the local namespaces client and returns its
// response and error as-is.
func (s *service) Get(ctx context.Context, req *api.GetNamespaceRequest) (*api.GetNamespaceResponse, error) {
        return s.local.Get(ctx, req)
}

// List forwards req unchanged to the local namespaces client and returns its
// response and error as-is.
func (s *service) List(ctx context.Context, req *api.ListNamespacesRequest) (*api.ListNamespacesResponse, error) {
        return s.local.List(ctx, req)
}

// Create forwards req unchanged to the local namespaces client and returns
// its response and error as-is.
func (s *service) Create(ctx context.Context, req *api.CreateNamespaceRequest) (*api.CreateNamespaceResponse, error) {
        return s.local.Create(ctx, req)
}

// Update forwards req unchanged to the local namespaces client and returns
// its response and error as-is.
func (s *service) Update(ctx context.Context, req *api.UpdateNamespaceRequest) (*api.UpdateNamespaceResponse, error) {
        return s.local.Update(ctx, req)
}

// Delete forwards req unchanged to the local namespaces client and returns
// its response and error as-is.
func (s *service) Delete(ctx context.Context, req *api.DeleteNamespaceRequest) (*ptypes.Empty, error) {
        return s.local.Delete(ctx, req)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package opt

import (
        "fmt"
        "os"
        "path/filepath"

        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

// Config is the configuration of the "opt" plugin.
type Config struct {
        // Path is the root of the opt directory; the plugin creates "bin" and
        // "lib" subdirectories beneath it. It is read from the "path" TOML key.
        Path string `toml:"path"`
}

// init registers the "opt" internal plugin.
//
// On initialization the plugin exports the configured path under the "path"
// key, creates <path>/bin and <path>/lib (mode 0711), and prepends them to the
// process-wide PATH and LD_LIBRARY_PATH environment variables respectively.
func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.InternalPlugin,
                ID:   "opt",
                Config: &Config{
                        Path: defaultPath,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        path := ic.Config.(*Config).Path
                        ic.Meta.Exports["path"] = path

                        bin := filepath.Join(path, "bin")
                        if err := os.MkdirAll(bin, 0711); err != nil {
                                return nil, err
                        }
                        if err := os.Setenv("PATH", prependPathList(bin, os.Getenv("PATH"))); err != nil {
                                return nil, fmt.Errorf("set binary image directory in path %s: %w", bin, err)
                        }

                        lib := filepath.Join(path, "lib")
                        if err := os.MkdirAll(lib, 0711); err != nil {
                                return nil, err
                        }
                        if err := os.Setenv("LD_LIBRARY_PATH", prependPathList(lib, os.Getenv("LD_LIBRARY_PATH"))); err != nil {
                                return nil, fmt.Errorf("set binary lib directory in path %s: %w", lib, err)
                        }
                        return &manager{}, nil
                },
        })
}

// prependPathList returns list with dir placed in front of it, joined by
// os.PathListSeparator. When list is empty it returns dir alone, so that no
// trailing separator (an empty list element, which search-path consumers
// treat as the current working directory) is introduced.
func prependPathList(dir, list string) string {
        if list == "" {
                return dir
        }
        return fmt.Sprintf("%s%c%s", dir, os.PathListSeparator, list)
}

// manager is the instance returned by the opt plugin's InitFn. It carries no
// state.
type manager struct {
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sandbox

import (
        "context"
        "fmt"
        "time"

        "google.golang.org/grpc"
        "google.golang.org/protobuf/types/known/anypb"

        eventtypes "github.com/containerd/containerd/api/events"
        api "github.com/containerd/containerd/api/services/sandbox/v1"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "sandbox-controllers",
                Requires: []plugin.Type{
                        plugins.SandboxControllerPlugin,
                        plugins.EventPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        sandboxers, err := ic.GetByType(plugins.SandboxControllerPlugin)
                        if err != nil {
                                return nil, err
                        }

                        sc := make(map[string]sandbox.Controller)
                        for name, p := range sandboxers {
                                sc[name] = p.(sandbox.Controller)
                        }

                        ep, err := ic.GetSingle(plugins.EventPlugin)
                        if err != nil {
                                return nil, err
                        }

                        return &controllerService{
                                sc:        sc,
                                publisher: ep.(events.Publisher),
                        }, nil
                },
        })
}

// controllerService is the gRPC front end for sandbox controllers. Each RPC
// selects a controller by the request's Sandboxer name and delegates to it.
type controllerService struct {
        // sc maps a sandboxer name to its controller.
        sc map[string]sandbox.Controller
        // publisher receives the sandbox lifecycle events emitted by Create,
        // Start and Wait.
        publisher events.Publisher
        api.UnimplementedControllerServer
}

// Compile-time check that *controllerService satisfies api.ControllerServer.
var _ api.ControllerServer = (*controllerService)(nil)

// Register attaches s to server as the sandbox Controller service
// implementation. It always returns nil.
func (s *controllerService) Register(server *grpc.Server) error {
        api.RegisterControllerServer(server, s)
        return nil
}

// getController looks up the sandbox controller registered under name.
//
// It returns an error wrapping errdefs.ErrInvalidArgument when name is empty
// and one wrapping errdefs.ErrNotFound when no controller has that name.
func (s *controllerService) getController(name string) (sandbox.Controller, error) {
        if name == "" {
                return nil, fmt.Errorf("%w: sandbox controller name can not be empty", errdefs.ErrInvalidArgument)
        }

        ctrl, found := s.sc[name]
        if !found {
                return nil, fmt.Errorf("%w: failed to get sandbox controller by %s", errdefs.ErrNotFound, name)
        }
        return ctrl, nil
}

// Create asks the controller named by req.Sandboxer to create the sandbox
// identified by the request, passing along the request options, and then
// publishes a SandboxCreate event.
//
// All errors are converted with errdefs.ToGRPC. A nil response is returned
// when the controller lookup fails; later failures return an empty response.
func (s *controllerService) Create(ctx context.Context, req *api.ControllerCreateRequest) (*api.ControllerCreateResponse, error) {
        log.G(ctx).WithField("req", req).Debug("create sandbox")
        // TODO: Rootfs
        ctrl, err := s.getController(req.Sandboxer)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        id := req.GetSandboxID()
        if err := ctrl.Create(ctx, sandbox.Sandbox{ID: id}, sandbox.WithOptions(req.GetOptions())); err != nil {
                return &api.ControllerCreateResponse{}, errdefs.ToGRPC(err)
        }

        // NOTE(review): this topic has no leading '/'; confirm the publisher
        // accepts topics in this form.
        event := &eventtypes.SandboxCreate{SandboxID: id}
        if err := s.publisher.Publish(ctx, "sandboxes/create", event); err != nil {
                return &api.ControllerCreateResponse{}, errdefs.ToGRPC(err)
        }

        return &api.ControllerCreateResponse{SandboxID: id}, nil
}

// Start asks the controller named by req.Sandboxer to start the sandbox,
// publishes a SandboxStart event, and returns the started instance's ID, pid,
// creation time and labels.
//
// All errors are converted with errdefs.ToGRPC. A nil response is returned
// when the controller lookup fails; later failures return an empty response.
func (s *controllerService) Start(ctx context.Context, req *api.ControllerStartRequest) (*api.ControllerStartResponse, error) {
        log.G(ctx).WithField("req", req).Debug("start sandbox")
        ctrl, err := s.getController(req.Sandboxer)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        id := req.GetSandboxID()
        instance, err := ctrl.Start(ctx, id)
        if err != nil {
                return &api.ControllerStartResponse{}, errdefs.ToGRPC(err)
        }

        event := &eventtypes.SandboxStart{SandboxID: id}
        if err := s.publisher.Publish(ctx, "sandboxes/start", event); err != nil {
                return &api.ControllerStartResponse{}, errdefs.ToGRPC(err)
        }

        resp := &api.ControllerStartResponse{
                SandboxID: instance.SandboxID,
                Pid:       instance.Pid,
                CreatedAt: protobuf.ToTimestamp(instance.CreatedAt),
                Labels:    instance.Labels,
        }
        return resp, nil
}

// Stop asks the controller named by req.Sandboxer to stop the sandbox,
// passing req.TimeoutSecs (interpreted as whole seconds) as the stop timeout.
//
// All errors are converted with errdefs.ToGRPC. A nil response is returned
// when the controller lookup fails; otherwise an empty response accompanies
// the controller's result.
func (s *controllerService) Stop(ctx context.Context, req *api.ControllerStopRequest) (*api.ControllerStopResponse, error) {
        // The message previously read "delete sandbox", which mislabelled stop
        // requests in the debug log.
        log.G(ctx).WithField("req", req).Debug("stop sandbox")
        ctrl, err := s.getController(req.Sandboxer)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        timeout := time.Duration(req.TimeoutSecs) * time.Second
        err = ctrl.Stop(ctx, req.GetSandboxID(), sandbox.WithTimeout(timeout))
        return &api.ControllerStopResponse{}, errdefs.ToGRPC(err)
}

// Wait asks the controller named by req.Sandboxer to wait on the sandbox,
// publishes a SandboxExit event carrying the exit status and time, and
// returns the same exit status and time to the caller.
//
// All errors are converted with errdefs.ToGRPC. A nil response is returned
// when the controller lookup fails; later failures return an empty response.
func (s *controllerService) Wait(ctx context.Context, req *api.ControllerWaitRequest) (*api.ControllerWaitResponse, error) {
        log.G(ctx).WithField("req", req).Debug("wait sandbox")
        ctrl, err := s.getController(req.Sandboxer)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        id := req.GetSandboxID()
        exit, err := ctrl.Wait(ctx, id)
        if err != nil {
                return &api.ControllerWaitResponse{}, errdefs.ToGRPC(err)
        }

        event := &eventtypes.SandboxExit{
                SandboxID:  id,
                ExitStatus: exit.ExitStatus,
                ExitedAt:   protobuf.ToTimestamp(exit.ExitedAt),
        }
        if err := s.publisher.Publish(ctx, "sandboxes/exit", event); err != nil {
                return &api.ControllerWaitResponse{}, errdefs.ToGRPC(err)
        }

        resp := &api.ControllerWaitResponse{
                ExitStatus: exit.ExitStatus,
                ExitedAt:   protobuf.ToTimestamp(exit.ExitedAt),
        }
        return resp, nil
}

// Status queries the controller named by req.Sandboxer for the sandbox's
// status and converts it to the API response. The Extra field of the
// response is always non-nil: it is an empty Any when the controller reports
// no extra data, otherwise a copy of the reported type URL and value.
//
// All errors are converted with errdefs.ToGRPC. A nil response is returned
// when the controller lookup fails; a status failure returns an empty
// response.
func (s *controllerService) Status(ctx context.Context, req *api.ControllerStatusRequest) (*api.ControllerStatusResponse, error) {
        log.G(ctx).WithField("req", req).Debug("sandbox status")
        ctrl, err := s.getController(req.Sandboxer)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        st, err := ctrl.Status(ctx, req.GetSandboxID(), req.GetVerbose())
        if err != nil {
                return &api.ControllerStatusResponse{}, errdefs.ToGRPC(err)
        }

        extra := &anypb.Any{}
        if st.Extra != nil {
                extra.TypeUrl = st.Extra.GetTypeUrl()
                extra.Value = st.Extra.GetValue()
        }

        resp := &api.ControllerStatusResponse{
                SandboxID: st.SandboxID,
                Pid:       st.Pid,
                State:     st.State,
                Info:      st.Info,
                CreatedAt: protobuf.ToTimestamp(st.CreatedAt),
                ExitedAt:  protobuf.ToTimestamp(st.ExitedAt),
                Extra:     extra,
        }
        return resp, nil
}

// Shutdown asks the controller named by req.Sandboxer to shut the sandbox
// down.
//
// All errors are converted with errdefs.ToGRPC. A nil response is returned
// when the controller lookup fails; otherwise an empty response accompanies
// the controller's result.
func (s *controllerService) Shutdown(ctx context.Context, req *api.ControllerShutdownRequest) (*api.ControllerShutdownResponse, error) {
        log.G(ctx).WithField("req", req).Debug("shutdown sandbox")
        ctrl, err := s.getController(req.Sandboxer)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        err = ctrl.Shutdown(ctx, req.GetSandboxID())
        return &api.ControllerShutdownResponse{}, errdefs.ToGRPC(err)
}

// Metrics asks the controller named by req.Sandboxer for the sandbox's
// metrics and returns them in the response.
//
// All errors are converted with errdefs.ToGRPC. A nil response is returned
// when the controller lookup fails; a metrics failure returns an empty
// response.
func (s *controllerService) Metrics(ctx context.Context, req *api.ControllerMetricsRequest) (*api.ControllerMetricsResponse, error) {
        log.G(ctx).WithField("req", req).Debug("sandbox metrics")
        ctrl, err := s.getController(req.Sandboxer)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        collected, err := ctrl.Metrics(ctx, req.GetSandboxID())
        if err != nil {
                return &api.ControllerMetricsResponse{}, errdefs.ToGRPC(err)
        }

        resp := &api.ControllerMetricsResponse{Metrics: collected}
        return resp, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sandbox

import (
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.SandboxStorePlugin,
                ID:   "local",
                Requires: []plugin.Type{
                        plugins.MetadataPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        m, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }

                        return metadata.NewSandboxStore(m.(*metadata.DB)), nil
                },
        })
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package sandbox

import (
        "context"

        "google.golang.org/grpc"

        api "github.com/containerd/containerd/api/services/sandbox/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/v2/core/sandbox"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "sandboxes",
                Requires: []plugin.Type{
                        plugins.SandboxStorePlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        sp, err := ic.GetByID(plugins.SandboxStorePlugin, "local")
                        if err != nil {
                                return nil, err
                        }

                        return &sandboxService{store: sp.(sandbox.Store)}, nil
                },
        })
}

// sandboxService is the gRPC front end for the sandbox store; each RPC
// converts between API and store types and delegates to store.
type sandboxService struct {
        // store is the sandbox store every RPC operates on.
        store sandbox.Store
        api.UnimplementedStoreServer
}

// Compile-time check that *sandboxService satisfies api.StoreServer.
var _ api.StoreServer = (*sandboxService)(nil)

// Register attaches s to server as the sandbox Store service implementation.
// It always returns nil.
func (s *sandboxService) Register(server *grpc.Server) error {
        api.RegisterStoreServer(server, s)
        return nil
}

// Create stores the sandbox described by req.Sandbox and returns the stored
// sandbox. Store errors are converted with errdefs.ToGRPC and accompanied by
// a nil response.
func (s *sandboxService) Create(ctx context.Context, req *api.StoreCreateRequest) (*api.StoreCreateResponse, error) {
        log.G(ctx).WithField("req", req).Debug("create sandbox")

        created, err := s.store.Create(ctx, sandbox.FromProto(req.Sandbox))
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &api.StoreCreateResponse{Sandbox: sandbox.ToProto(&created)}
        return resp, nil
}

// Update applies req.Sandbox to the store, passing req.Fields as the field
// paths, and returns the updated sandbox. Store errors are converted with
// errdefs.ToGRPC and accompanied by a nil response.
func (s *sandboxService) Update(ctx context.Context, req *api.StoreUpdateRequest) (*api.StoreUpdateResponse, error) {
        log.G(ctx).WithField("req", req).Debug("update sandbox")

        updated, err := s.store.Update(ctx, sandbox.FromProto(req.Sandbox), req.Fields...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &api.StoreUpdateResponse{Sandbox: sandbox.ToProto(&updated)}
        return resp, nil
}

// List returns every stored sandbox matching req.Filters, converted to API
// types in the order the store returned them. Store errors are converted
// with errdefs.ToGRPC and accompanied by a nil response.
func (s *sandboxService) List(ctx context.Context, req *api.StoreListRequest) (*api.StoreListResponse, error) {
        log.G(ctx).WithField("req", req).Debug("list sandboxes")

        sandboxes, err := s.store.List(ctx, req.Filters...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        converted := make([]*types.Sandbox, 0, len(sandboxes))
        for idx := range sandboxes {
                // Index into the slice so ToProto receives the element itself.
                converted = append(converted, sandbox.ToProto(&sandboxes[idx]))
        }

        return &api.StoreListResponse{List: converted}, nil
}

// Get fetches the sandbox identified by req.SandboxID from the store and
// returns it converted to its API type. Store errors are converted with
// errdefs.ToGRPC and accompanied by a nil response.
func (s *sandboxService) Get(ctx context.Context, req *api.StoreGetRequest) (*api.StoreGetResponse, error) {
        log.G(ctx).WithField("req", req).Debug("get sandbox")

        found, err := s.store.Get(ctx, req.SandboxID)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        return &api.StoreGetResponse{Sandbox: sandbox.ToProto(&found)}, nil
}

// Delete removes the sandbox identified by req.SandboxID from the store.
// Store errors are converted with errdefs.ToGRPC and accompanied by a nil
// response.
func (s *sandboxService) Delete(ctx context.Context, req *api.StoreDeleteRequest) (*api.StoreDeleteResponse, error) {
        log.G(ctx).WithField("req", req).Debug("delete sandbox")

        err := s.store.Delete(ctx, req.SandboxID)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return &api.StoreDeleteResponse{}, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package snapshots

import (
        "context"

        snapshotsapi "github.com/containerd/containerd/api/services/snapshots/v1"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "google.golang.org/grpc"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "snapshots",
                Requires: []plugin.Type{
                        plugins.ServicePlugin,
                },
                InitFn: newService,
        })
}

// empty is the shared Empty response returned by the RPCs in this service
// that carry no payload (Commit, Remove and Cleanup).
var empty = &ptypes.Empty{}

// service is the gRPC front end for snapshotters; each RPC selects a
// snapshotter by the request's Snapshotter name and delegates to it.
type service struct {
        // ss maps a snapshotter name to its implementation.
        ss map[string]snapshots.Snapshotter
        snapshotsapi.UnimplementedSnapshotsServer
}

// newService is the InitFn of the snapshots gRPC plugin. It looks up the
// snapshots service plugin by ID and uses it as the snapshotter map; the
// lookup result is asserted to map[string]snapshots.Snapshotter without a
// checked assertion.
func newService(ic *plugin.InitContext) (interface{}, error) {
        instance, err := ic.GetByID(plugins.ServicePlugin, services.SnapshotsService)
        if err != nil {
                return nil, err
        }

        snapshotters := instance.(map[string]snapshots.Snapshotter)
        return &service{ss: snapshotters}, nil
}

// getSnapshotter returns the snapshotter registered under name.
//
// It returns an InvalidArgument gRPC error when name is empty, or when no
// non-nil snapshotter is registered under that name.
func (s *service) getSnapshotter(name string) (snapshots.Snapshotter, error) {
        if name == "" {
                return nil, errdefs.ToGRPCf(errdefs.ErrInvalidArgument, "snapshotter argument missing")
        }

        if sn := s.ss[name]; sn != nil {
                return sn, nil
        }
        return nil, errdefs.ToGRPCf(errdefs.ErrInvalidArgument, "snapshotter not loaded: %s", name)
}

// Register attaches s to gs as the Snapshots service implementation. It
// always returns nil.
func (s *service) Register(gs *grpc.Server) error {
        snapshotsapi.RegisterSnapshotsServer(gs, s)
        return nil
}

// Prepare asks the snapshotter named by pr.Snapshotter to prepare snapshot
// pr.Key on top of pr.Parent, attaching pr.Labels when they are set, and
// returns the resulting mounts.
//
// Lookup errors are returned as produced by getSnapshotter; snapshotter
// errors are converted with errdefs.ToGRPC.
func (s *service) Prepare(ctx context.Context, pr *snapshotsapi.PrepareSnapshotRequest) (*snapshotsapi.PrepareSnapshotResponse, error) {
        log.G(ctx).WithField("parent", pr.Parent).WithField("key", pr.Key).Debugf("prepare snapshot")
        snapshotter, err := s.getSnapshotter(pr.Snapshotter)
        if err != nil {
                return nil, err
        }

        var labelOpts []snapshots.Opt
        if labels := pr.Labels; labels != nil {
                labelOpts = append(labelOpts, snapshots.WithLabels(labels))
        }

        mounts, err := snapshotter.Prepare(ctx, pr.Key, pr.Parent, labelOpts...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &snapshotsapi.PrepareSnapshotResponse{Mounts: mount.ToProto(mounts)}
        return resp, nil
}

// View asks the snapshotter named by pr.Snapshotter to create view snapshot
// pr.Key on top of pr.Parent, attaching pr.Labels when they are set, and
// returns the resulting mounts.
//
// Lookup errors are returned as produced by getSnapshotter; snapshotter
// errors are converted with errdefs.ToGRPC.
func (s *service) View(ctx context.Context, pr *snapshotsapi.ViewSnapshotRequest) (*snapshotsapi.ViewSnapshotResponse, error) {
        log.G(ctx).WithField("parent", pr.Parent).WithField("key", pr.Key).Debugf("prepare view snapshot")
        snapshotter, err := s.getSnapshotter(pr.Snapshotter)
        if err != nil {
                return nil, err
        }

        var labelOpts []snapshots.Opt
        if labels := pr.Labels; labels != nil {
                labelOpts = append(labelOpts, snapshots.WithLabels(labels))
        }

        mounts, err := snapshotter.View(ctx, pr.Key, pr.Parent, labelOpts...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &snapshotsapi.ViewSnapshotResponse{Mounts: mount.ToProto(mounts)}
        return resp, nil
}

// Mounts returns the mounts of snapshot mr.Key from the snapshotter named by
// mr.Snapshotter.
//
// Lookup errors are returned as produced by getSnapshotter; snapshotter
// errors are converted with errdefs.ToGRPC.
func (s *service) Mounts(ctx context.Context, mr *snapshotsapi.MountsRequest) (*snapshotsapi.MountsResponse, error) {
        log.G(ctx).WithField("key", mr.Key).Debugf("get snapshot mounts")
        snapshotter, err := s.getSnapshotter(mr.Snapshotter)
        if err != nil {
                return nil, err
        }

        found, err := snapshotter.Mounts(ctx, mr.Key)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &snapshotsapi.MountsResponse{Mounts: mount.ToProto(found)}
        return resp, nil
}

// Commit asks the snapshotter named by cr.Snapshotter to commit the snapshot
// cr.Key under the name cr.Name, attaching cr.Labels when they are set.
//
// Lookup errors are returned as produced by getSnapshotter; snapshotter
// errors are converted with errdefs.ToGRPC.
func (s *service) Commit(ctx context.Context, cr *snapshotsapi.CommitSnapshotRequest) (*ptypes.Empty, error) {
        log.G(ctx).WithField("key", cr.Key).WithField("name", cr.Name).Debugf("commit snapshot")
        snapshotter, err := s.getSnapshotter(cr.Snapshotter)
        if err != nil {
                return nil, err
        }

        var labelOpts []snapshots.Opt
        if labels := cr.Labels; labels != nil {
                labelOpts = append(labelOpts, snapshots.WithLabels(labels))
        }

        err = snapshotter.Commit(ctx, cr.Name, cr.Key, labelOpts...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return empty, nil
}

// Remove asks the snapshotter named by rr.Snapshotter to remove snapshot
// rr.Key.
//
// Lookup errors are returned as produced by getSnapshotter; snapshotter
// errors are converted with errdefs.ToGRPC.
func (s *service) Remove(ctx context.Context, rr *snapshotsapi.RemoveSnapshotRequest) (*ptypes.Empty, error) {
        log.G(ctx).WithField("key", rr.Key).Debugf("remove snapshot")
        snapshotter, err := s.getSnapshotter(rr.Snapshotter)
        if err != nil {
                return nil, err
        }

        err = snapshotter.Remove(ctx, rr.Key)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return empty, nil
}

// Stat returns the info of snapshot sr.Key from the snapshotter named by
// sr.Snapshotter.
//
// Lookup errors are returned as produced by getSnapshotter; snapshotter
// errors are converted with errdefs.ToGRPC.
func (s *service) Stat(ctx context.Context, sr *snapshotsapi.StatSnapshotRequest) (*snapshotsapi.StatSnapshotResponse, error) {
        log.G(ctx).WithField("key", sr.Key).Debugf("stat snapshot")
        snapshotter, err := s.getSnapshotter(sr.Snapshotter)
        if err != nil {
                return nil, err
        }

        stat, err := snapshotter.Stat(ctx, sr.Key)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &snapshotsapi.StatSnapshotResponse{Info: snapshots.InfoToProto(stat)}
        return resp, nil
}

// Update asks the snapshotter named by sr.Snapshotter to update the snapshot
// described by sr.Info, restricted to the paths in sr.UpdateMask, and returns
// the updated info.
//
// It returns an InvalidArgument gRPC error when sr.Info is not set, rather
// than dereferencing a nil message. Lookup errors are returned as produced by
// getSnapshotter; snapshotter errors are converted with errdefs.ToGRPC.
func (s *service) Update(ctx context.Context, sr *snapshotsapi.UpdateSnapshotRequest) (*snapshotsapi.UpdateSnapshotResponse, error) {
        // Use the nil-safe getters here: Info is validated only after the
        // snapshotter lookup so the lookup error keeps precedence.
        log.G(ctx).WithField("key", sr.GetInfo().GetName()).Debugf("update snapshot")
        sn, err := s.getSnapshotter(sr.Snapshotter)
        if err != nil {
                return nil, err
        }
        if sr.Info == nil {
                return nil, errdefs.ToGRPCf(errdefs.ErrInvalidArgument, "snapshot info missing")
        }

        info, err := sn.Update(ctx, snapshots.InfoFromProto(sr.Info), sr.UpdateMask.GetPaths()...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        return &snapshotsapi.UpdateSnapshotResponse{Info: snapshots.InfoToProto(info)}, nil
}

// List streams the info of every snapshot matching sr.Filters from the
// snapshotter named by sr.Snapshotter. Infos are sent in batches of up to
// batchSize per response message, with a final partial batch if any remain.
//
// Lookup errors are returned as produced by getSnapshotter. Errors from the
// walk are converted with errdefs.ToGRPC, matching the other RPCs of this
// service; errors from the final send are returned as received.
func (s *service) List(sr *snapshotsapi.ListSnapshotsRequest, ss snapshotsapi.Snapshots_ListServer) error {
        sn, err := s.getSnapshotter(sr.Snapshotter)
        if err != nil {
                return err
        }

        // batchSize is the number of infos accumulated before a response is
        // flushed to the stream.
        const batchSize = 100

        var (
                buffer    []*snapshotsapi.Info
                sendBlock = func(block []*snapshotsapi.Info) error {
                        return ss.Send(&snapshotsapi.ListSnapshotsResponse{
                                Info: block,
                        })
                }
        )
        err = sn.Walk(ss.Context(), func(ctx context.Context, info snapshots.Info) error {
                buffer = append(buffer, snapshots.InfoToProto(info))

                if len(buffer) >= batchSize {
                        if err := sendBlock(buffer); err != nil {
                                return err
                        }

                        // Keep the backing array for the next batch.
                        buffer = buffer[:0]
                }

                return nil
        }, sr.Filters...)
        if err != nil {
                return errdefs.ToGRPC(err)
        }
        if len(buffer) > 0 {
                // Send remaining infos
                if err := sendBlock(buffer); err != nil {
                        return err
                }
        }

        return nil
}

// Usage returns the resource usage of snapshot ur.Key from the snapshotter
// named by ur.Snapshotter.
//
// Lookup errors are returned as produced by getSnapshotter; snapshotter
// errors are converted with errdefs.ToGRPC.
func (s *service) Usage(ctx context.Context, ur *snapshotsapi.UsageRequest) (*snapshotsapi.UsageResponse, error) {
        snapshotter, err := s.getSnapshotter(ur.Snapshotter)
        if err != nil {
                return nil, err
        }

        used, err := snapshotter.Usage(ctx, ur.Key)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := snapshots.UsageToProto(used)
        return resp, nil
}

// Cleanup runs the Cleanup method of the snapshotter named by cr.Snapshotter.
//
// It returns a NotImplemented gRPC error when that snapshotter does not
// implement snapshots.Cleaner. Lookup errors are returned as produced by
// getSnapshotter; cleanup errors are converted with errdefs.ToGRPC.
func (s *service) Cleanup(ctx context.Context, cr *snapshotsapi.CleanupRequest) (*ptypes.Empty, error) {
        snapshotter, err := s.getSnapshotter(cr.Snapshotter)
        if err != nil {
                return nil, err
        }

        cleaner, supported := snapshotter.(snapshots.Cleaner)
        if !supported {
                return nil, errdefs.ToGRPCf(errdefs.ErrNotImplemented, "snapshotter does not implement Cleanup method")
        }

        if err := cleaner.Cleanup(ctx); err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return empty, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package snapshots

import (
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.ServicePlugin,
                ID:   services.SnapshotsService,
                Requires: []plugin.Type{
                        plugins.MetadataPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        m, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }

                        return m.(*metadata.DB).Snapshotters(), nil
                },
        })
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package streaming

import (
        "errors"
        "io"

        api "github.com/containerd/containerd/api/services/streaming/v1"
        "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "github.com/containerd/typeurl/v2"
        "google.golang.org/grpc"
)

// emptyResponse holds a marshalled ptypes.Empty. It is assigned once in init
// and sent by Stream as the acknowledgement after a stream is registered.
var emptyResponse typeurl.Any

// init pre-marshals the empty acknowledgement response and registers the
// "streaming" gRPC plugin. It panics if the empty message cannot be
// marshalled. The plugin's InitFn looks up the streaming plugin with ID
// "manager" and asserts it to streaming.StreamManager without a checked
// assertion.
func init() {
        // save marshalled empty response to avoid marshaling everytime
        var err error
        emptyResponse, err = typeurl.MarshalAny(&ptypes.Empty{})
        if err != nil {
                panic(err)
        }

        initFn := func(ic *plugin.InitContext) (interface{}, error) {
                instance, err := ic.GetByID(plugins.StreamingPlugin, "manager")
                if err != nil {
                        return nil, err
                }
                svc := &service{manager: instance.(streaming.StreamManager)}
                return svc, nil
        }

        registry.Register(&plugin.Registration{
                Type:     plugins.GRPCPlugin,
                ID:       "streaming",
                Requires: []plugin.Type{plugins.StreamingPlugin},
                InitFn:   initFn,
        })
}

// service is the gRPC front end for streaming; it registers incoming gRPC
// streams with the stream manager.
type service struct {
        // manager is the stream manager that accepted streams are registered with.
        manager streaming.StreamManager
        api.UnimplementedStreamingServer
}

// Register attaches s to server as the Streaming service implementation. It
// always returns nil.
func (s *service) Register(server *grpc.Server) error {
        api.RegisterStreamingServer(server, s)
        return nil
}

// Stream serves one streaming session. The first packet received must
// unmarshal into an api.StreamInit; its ID is used to register the stream with
// the manager. Once registration succeeds an empty response is sent, and the
// call then blocks until either the stream context is done or the registered
// stream is closed.
func (s *service) Stream(srv api.Streaming_StreamServer) error {
        // TODO: Timeout waiting
        first, err := srv.Recv()
        if err != nil {
                return err
        }
        var initMsg api.StreamInit
        if err = typeurl.UnmarshalTo(first, &initMsg); err != nil {
                return err
        }

        stream := &serviceStream{
                s:  srv,
                cc: make(chan struct{}),
        }

        ctx := srv.Context()
        log.G(ctx).WithField("stream", initMsg.ID).Debug("registering stream")
        if err = s.manager.Register(ctx, initMsg.ID, stream); err != nil {
                return err
        }

        // Send response packet after registering stream
        if err = srv.Send(protobuf.FromAny(emptyResponse)); err != nil {
                return err
        }

        // Hold the RPC open until the context ends or Close is called on the
        // registered stream.
        select {
        case <-ctx.Done():
                // TODO: Should return error if not cancelled?
        case <-stream.cc:
        }

        return nil
}

// serviceStream wraps a server-side gRPC stream so it can be registered with
// the stream manager.
type serviceStream struct {
        // s is the underlying gRPC stream used by Send and Recv
        s  api.Streaming_StreamServer
        // cc is closed by Close; Stream waits on it to know when to return
        cc chan struct{}
}

// Send writes a to the underlying gRPC stream. io.EOF is returned unchanged;
// any other error is translated exactly once with errdefs.FromGRPC, matching
// the behaviour of Recv.
func (ss *serviceStream) Send(a typeurl.Any) (err error) {
        err = ss.s.Send(protobuf.FromAny(a))
        // Check for io.EOF on the raw error and convert only once. Converting
        // before the check (and then again after it) would run the io.EOF test
        // and the second conversion against an error that is no longer the
        // original gRPC error.
        if !errors.Is(err, io.EOF) {
                err = errdefs.FromGRPC(err)
        }
        return
}

// Recv reads the next message from the underlying gRPC stream. io.EOF is
// passed through untouched; every other error value is run through
// errdefs.FromGRPC before being returned.
func (ss *serviceStream) Recv() (a typeurl.Any, err error) {
        msg, recvErr := ss.s.Recv()
        if errors.Is(recvErr, io.EOF) {
                return msg, recvErr
        }
        return msg, errdefs.FromGRPC(recvErr)
}

// Close signals the Stream call that owns this stream to return by closing cc.
// Calling it again after cc is closed is a no-op. It always returns nil.
// NOTE(review): the check-then-close below is not guarded against two
// goroutines calling Close at the same time — confirm callers serialize it.
func (ss *serviceStream) Close() error {
        select {
        case <-ss.cc:
                // already closed
        default:
                close(ss.cc)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package tasks

import (
        "bytes"
        "context"
        "errors"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "time"

        api "github.com/containerd/containerd/api/services/tasks/v1"
        "github.com/containerd/containerd/api/types"
        "github.com/containerd/containerd/api/types/runc/options"
        "github.com/containerd/containerd/api/types/task"
        "github.com/containerd/containerd/v2/core/containers"
        "github.com/containerd/containerd/v2/core/content"
        "github.com/containerd/containerd/v2/core/events"
        "github.com/containerd/containerd/v2/core/images"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/runtime"
        "github.com/containerd/containerd/v2/pkg/archive"
        "github.com/containerd/containerd/v2/pkg/blockio"
        "github.com/containerd/containerd/v2/pkg/filters"
        "github.com/containerd/containerd/v2/pkg/protobuf"
        "github.com/containerd/containerd/v2/pkg/protobuf/proto"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/pkg/rdt"
        "github.com/containerd/containerd/v2/pkg/timeout"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "github.com/containerd/typeurl/v2"
        "github.com/opencontainers/go-digest"
        ocispec "github.com/opencontainers/image-spec/specs-go/v1"
        "google.golang.org/grpc"
        "google.golang.org/grpc/codes"
        "google.golang.org/grpc/status"
)

var (
        // compile-time check that *local satisfies api.TasksClient
        _     = (api.TasksClient)(&local{})
        // empty is the shared response returned by methods with no payload
        empty = &ptypes.Empty{}
)

const (
        // stateTimeout is the key under which the timeout for task state
        // queries is registered (see init) and looked up (see getProcessState)
        stateTimeout = "io.containerd.timeout.task.state"
)

// Config holds the configuration of the tasks service plugin.
type Config struct {
        // BlockIOConfigFile specifies the path to the blockio configuration
        // file; it is handed to blockio.SetConfig during plugin initialization
        BlockIOConfigFile string `toml:"blockio_config_file" json:"blockioConfigFile"`
        // RdtConfigFile specifies the path to the RDT configuration file; it is
        // handed to rdt.SetConfig during plugin initialization
        RdtConfigFile string `toml:"rdt_config_file" json:"rdtConfigFile"`
}

// init registers the tasks service plugin with initFunc as its initializer
// and sets the default timeout for task state queries to two seconds.
func init() {
        registry.Register(&plugin.Registration{
                Type:     plugins.ServicePlugin,
                ID:       services.TasksService,
                Requires: tasksServiceRequires,
                Config:   &Config{},
                InitFn:   initFunc,
        })

        timeout.Set(stateTimeout, 2*time.Second)
}

// initFunc builds the local tasks service from its plugin dependencies: the
// v2 runtime ("task"), the metadata database, the event publisher and,
// when available, a task monitor (a no-op monitor is used if none is
// registered). Tasks reported by the runtime are handed to the monitor, and
// the blockio and RDT configuration files from Config are applied; failures
// of those two steps are logged and do not abort initialization.
func initFunc(ic *plugin.InitContext) (interface{}, error) {
        config := ic.Config.(*Config)

        v2r, err := ic.GetByID(plugins.RuntimePluginV2, "task")
        if err != nil {
                return nil, err
        }

        m, err := ic.GetSingle(plugins.MetadataPlugin)
        if err != nil {
                return nil, err
        }

        ep, err := ic.GetSingle(plugins.EventPlugin)
        if err != nil {
                return nil, err
        }

        monitor, err := ic.GetSingle(plugins.TaskMonitorPlugin)
        if err != nil {
                // A missing monitor plugin is tolerated; anything else is fatal.
                if !errors.Is(err, plugin.ErrPluginNotFound) {
                        return nil, err
                }
                monitor = runtime.NewNoopMonitor()
        }

        db := m.(*metadata.DB)
        l := &local{
                containers: metadata.NewContainerStore(db),
                store:      db.ContentStore(),
                publisher:  ep.(events.Publisher),
                monitor:    monitor.(runtime.TaskMonitor),
                v2Runtime:  v2r.(runtime.PlatformRuntime),
        }

        v2Tasks, err := l.v2Runtime.Tasks(ic.Context, true)
        if err != nil {
                return nil, err
        }
        for _, t := range v2Tasks {
                // Monitoring existing tasks is best effort: report a failure
                // instead of silently dropping it, but keep initializing.
                if err := l.monitor.Monitor(t, nil); err != nil {
                        log.G(ic.Context).WithError(err).WithField("id", t.ID()).Warn("failed to monitor task")
                }
        }

        if err := blockio.SetConfig(config.BlockIOConfigFile); err != nil {
                log.G(ic.Context).WithError(err).Errorf("blockio initialization failed")
        }
        if err := rdt.SetConfig(config.RdtConfigFile); err != nil {
                log.G(ic.Context).WithError(err).Errorf("RDT initialization failed")
        }

        return l, nil
}

// local is the in-process implementation of api.TasksClient.
type local struct {
        // containers resolves container records by ID
        containers containers.Store
        // store is the content store used for checkpoint data
        store      content.Store
        // publisher is the event publisher obtained from the event plugin
        publisher  events.Publisher

        // monitor is told about tasks when they are created and deleted
        monitor   runtime.TaskMonitor
        // v2Runtime creates, looks up and deletes tasks
        v2Runtime runtime.PlatformRuntime
}

// Create creates a task for the container named by r.ContainerID.
//
// Task options, when present, supply the checkpoint path and the task API
// address and version. If no checkpoint path was supplied but r.Checkpoint is
// set, the checkpoint is read from the content store and unpacked into a new
// temporary directory created under $XDG_RUNTIME_DIR. That directory is
// removed again if the call fails before the runtime has created the task.
// An AlreadyExists error is returned when the runtime already holds a task
// with this ID. On success the task is handed to the monitor and its pid is
// returned in the response.
func (l *local) Create(ctx context.Context, r *api.CreateTaskRequest, _ ...grpc.CallOption) (*api.CreateTaskResponse, error) {
        container, err := l.getContainer(ctx, r.ContainerID)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        var (
                checkpointPath string
                taskAPIAddress string
                taskAPIVersion uint32
        )

        if r.Options != nil {
                taskOptions, err := formatOptions(container.Runtime.Name, r.Options)
                if err != nil {
                        return nil, err
                }
                checkpointPath = taskOptions.CriuImagePath
                taskAPIAddress = taskOptions.TaskApiAddress
                taskAPIVersion = taskOptions.TaskApiVersion
        }

        // removeCheckpoint is true while checkpointPath names a temporary
        // directory created by this call that no task has been created with yet.
        removeCheckpoint := false
        defer func() {
                if removeCheckpoint {
                        // best-effort cleanup; the original failure is what the
                        // caller needs to see
                        _ = os.RemoveAll(checkpointPath)
                }
        }()

        // no explicit path: get checkpointPath from the checkpoint image
        if checkpointPath == "" && r.Checkpoint != nil {
                // Validate before touching the filesystem so that a rejected
                // request leaves no directory behind.
                if r.Checkpoint.MediaType != images.MediaTypeContainerd1Checkpoint {
                        return nil, fmt.Errorf("unsupported checkpoint type %q", r.Checkpoint.MediaType)
                }
                checkpointPath, err = os.MkdirTemp(os.Getenv("XDG_RUNTIME_DIR"), "ctrd-checkpoint")
                if err != nil {
                        return nil, err
                }
                removeCheckpoint = true

                reader, err := l.store.ReaderAt(ctx, ocispec.Descriptor{
                        MediaType:   r.Checkpoint.MediaType,
                        Digest:      digest.Digest(r.Checkpoint.Digest),
                        Size:        r.Checkpoint.Size,
                        Annotations: r.Checkpoint.Annotations,
                })
                if err != nil {
                        return nil, err
                }
                _, err = archive.Apply(ctx, checkpointPath, content.NewReader(reader))
                reader.Close()
                if err != nil {
                        return nil, err
                }
        }
        opts := runtime.CreateOpts{
                Spec: container.Spec,
                IO: runtime.IO{
                        Stdin:    r.Stdin,
                        Stdout:   r.Stdout,
                        Stderr:   r.Stderr,
                        Terminal: r.Terminal,
                },
                Checkpoint:     checkpointPath,
                Runtime:        container.Runtime.Name,
                RuntimeOptions: container.Runtime.Options,
                TaskOptions:    r.Options,
                SandboxID:      container.SandboxID,
                Address:        taskAPIAddress,
                Version:        taskAPIVersion,
        }
        // an explicit runtime path in the request overrides the container's
        // runtime name
        if r.RuntimePath != "" {
                opts.Runtime = r.RuntimePath
        }
        for _, m := range r.Rootfs {
                opts.Rootfs = append(opts.Rootfs, mount.Mount{
                        Type:    m.Type,
                        Source:  m.Source,
                        Target:  m.Target,
                        Options: m.Options,
                })
        }

        rtime := l.v2Runtime

        // refuse to create a second task for the same container ID
        _, err = rtime.Get(ctx, r.ContainerID)
        if err != nil && !errdefs.IsNotFound(err) {
                return nil, errdefs.ToGRPC(err)
        }
        if err == nil {
                return nil, errdefs.ToGRPC(fmt.Errorf("task %s: %w", r.ContainerID, errdefs.ErrAlreadyExists))
        }
        c, err := rtime.Create(ctx, r.ContainerID, opts)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        // The task was created with opts.Checkpoint pointing at the directory,
        // so it must be kept from here on.
        removeCheckpoint = false

        labels := map[string]string{"runtime": container.Runtime.Name}
        if err := l.monitor.Monitor(c, labels); err != nil {
                return nil, fmt.Errorf("monitor task: %w", err)
        }
        pid, err := c.PID(ctx)
        if err != nil {
                return nil, fmt.Errorf("failed to get task pid: %w", err)
        }
        return &api.CreateTaskResponse{
                ContainerID: r.ContainerID,
                Pid:         pid,
        }, nil
}

// Start starts the task of r.ContainerID, or the exec process r.ExecID inside
// it when an exec ID is given, and reports the pid taken from the process
// state queried right after the start.
func (l *local) Start(ctx context.Context, r *api.StartRequest, _ ...grpc.CallOption) (*api.StartResponse, error) {
        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }

        // Default to the task itself; switch to the exec process on request.
        var proc runtime.Process = tsk
        if r.ExecID != "" {
                proc, err = tsk.Process(ctx, r.ExecID)
                if err != nil {
                        return nil, errdefs.ToGRPC(err)
                }
        }

        if err = proc.Start(ctx); err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        state, err := proc.State(ctx)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &api.StartResponse{Pid: state.Pid}
        return resp, nil
}

// Delete removes the task belonging to r.ContainerID. The task is first
// withdrawn from the monitor and then deleted through the runtime; the exit
// status, exit time and pid reported by the runtime are returned.
func (l *local) Delete(ctx context.Context, r *api.DeleteTaskRequest, _ ...grpc.CallOption) (*api.DeleteResponse, error) {
        container, err := l.getContainer(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }

        // Resolve the task object; a failed lookup is reported as NotFound.
        tsk, err := l.getTaskFromContainer(ctx, container)
        if err != nil {
                return nil, err
        }

        if err = l.monitor.Stop(tsk); err != nil {
                return nil, err
        }

        exit, err := l.v2Runtime.Delete(ctx, r.ContainerID)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &api.DeleteResponse{
                ExitStatus: exit.Status,
                ExitedAt:   protobuf.ToTimestamp(exit.Timestamp),
                Pid:        exit.Pid,
        }
        return resp, nil
}

// DeleteProcess deletes the exec process r.ExecID of the task belonging to
// r.ContainerID and returns its exit status, exit time and pid.
func (l *local) DeleteProcess(ctx context.Context, r *api.DeleteProcessRequest, _ ...grpc.CallOption) (*api.DeleteResponse, error) {
        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }

        proc, err := tsk.Process(ctx, r.ExecID)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        exit, err := proc.Delete(ctx)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &api.DeleteResponse{
                ID:         r.ExecID,
                ExitStatus: exit.Status,
                ExitedAt:   protobuf.ToTimestamp(exit.Timestamp),
                Pid:        exit.Pid,
        }
        return resp, nil
}

// getProcessState queries the state of p under the stateTimeout deadline and
// converts it to its API representation. NotFound and Unavailable errors are
// returned to the caller; any other error is only logged, and the conversion
// proceeds with whatever state value was returned alongside it.
func getProcessState(ctx context.Context, p runtime.Process) (*task.Process, error) {
        ctx, cancel := timeout.WithContext(ctx, stateTimeout)
        defer cancel()

        state, err := p.State(ctx)
        switch {
        case err == nil:
                // nothing to report
        case errdefs.IsNotFound(err), errdefs.IsUnavailable(err):
                return nil, err
        default:
                log.G(ctx).WithError(err).Errorf("get state for %s", p.ID())
        }

        // Map the runtime status onto the API enum; anything unrecognized
        // stays UNKNOWN and is logged.
        apiStatus := task.Status_UNKNOWN
        switch state.Status {
        case runtime.CreatedStatus:
                apiStatus = task.Status_CREATED
        case runtime.RunningStatus:
                apiStatus = task.Status_RUNNING
        case runtime.StoppedStatus:
                apiStatus = task.Status_STOPPED
        case runtime.PausedStatus:
                apiStatus = task.Status_PAUSED
        case runtime.PausingStatus:
                apiStatus = task.Status_PAUSING
        default:
                log.G(ctx).WithField("status", state.Status).Warn("unknown status")
        }

        proc := &task.Process{
                ID:         p.ID(),
                Pid:        state.Pid,
                Status:     apiStatus,
                Stdin:      state.Stdin,
                Stdout:     state.Stdout,
                Stderr:     state.Stderr,
                Terminal:   state.Terminal,
                ExitStatus: state.ExitStatus,
                ExitedAt:   protobuf.ToTimestamp(state.ExitedAt),
        }
        return proc, nil
}

// Get returns the state of the task belonging to r.ContainerID, or of the
// exec process r.ExecID inside it when an exec ID is given.
func (l *local) Get(ctx context.Context, r *api.GetRequest, _ ...grpc.CallOption) (*api.GetResponse, error) {
        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }

        var proc runtime.Process = tsk
        if r.ExecID != "" {
                proc, err = tsk.Process(ctx, r.ExecID)
                if err != nil {
                        return nil, errdefs.ToGRPC(err)
                }
        }

        state, err := getProcessState(ctx, proc)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return &api.GetResponse{Process: state}, nil
}

// List returns the state of every task reported by the runtime. Tasks whose
// state cannot be obtained are left out of the response (see addTasks).
func (l *local) List(ctx context.Context, r *api.ListTasksRequest, _ ...grpc.CallOption) (*api.ListTasksResponse, error) {
        tasks, err := l.v2Runtime.Tasks(ctx, false)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := new(api.ListTasksResponse)
        addTasks(ctx, resp, tasks)
        return resp, nil
}

// addTasks appends the API representation of each task to r.Tasks. A task
// whose state lookup fails is skipped; the failure is logged unless it is a
// NotFound error.
func addTasks(ctx context.Context, r *api.ListTasksResponse, tasks []runtime.Task) {
        for _, t := range tasks {
                proc, err := getProcessState(ctx, t)
                if err == nil {
                        r.Tasks = append(r.Tasks, proc)
                        continue
                }
                if errdefs.IsNotFound(err) { // handle race with deletion
                        continue
                }
                log.G(ctx).WithError(err).WithField("id", t.ID()).Error("converting task to protobuf")
        }
}

// Pause pauses the task belonging to r.ContainerID.
func (l *local) Pause(ctx context.Context, r *api.PauseTaskRequest, _ ...grpc.CallOption) (*ptypes.Empty, error) {
        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }
        if err = tsk.Pause(ctx); err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return empty, nil
}

// Resume resumes the task belonging to r.ContainerID.
func (l *local) Resume(ctx context.Context, r *api.ResumeTaskRequest, _ ...grpc.CallOption) (*ptypes.Empty, error) {
        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }
        if err = tsk.Resume(ctx); err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return empty, nil
}

// Kill sends r.Signal to the task belonging to r.ContainerID, or to the exec
// process r.ExecID inside it when an exec ID is given. r.All is passed through
// to the process unchanged.
func (l *local) Kill(ctx context.Context, r *api.KillRequest, _ ...grpc.CallOption) (*ptypes.Empty, error) {
        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }

        var proc runtime.Process = tsk
        if r.ExecID != "" {
                proc, err = tsk.Process(ctx, r.ExecID)
                if err != nil {
                        return nil, errdefs.ToGRPC(err)
                }
        }

        if err = proc.Kill(ctx, r.Signal, r.All); err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return empty, nil
}

// ListPids returns the processes reported by the task belonging to
// r.ContainerID. Any extra info attached to a process is marshalled into the
// response entry; a marshalling failure aborts the whole call.
func (l *local) ListPids(ctx context.Context, r *api.ListPidsRequest, _ ...grpc.CallOption) (*api.ListPidsResponse, error) {
        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }

        pids, err := tsk.Pids(ctx)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        var processes []*task.ProcessInfo
        for _, p := range pids {
                entry := &task.ProcessInfo{Pid: p.Pid}
                if p.Info != nil {
                        info, err := protobuf.MarshalAnyToProto(p.Info)
                        if err != nil {
                                return nil, fmt.Errorf("failed to marshal process %d info: %w", p.Pid, err)
                        }
                        entry.Info = info
                }
                processes = append(processes, entry)
        }

        return &api.ListPidsResponse{Processes: processes}, nil
}

// Exec adds a new exec process with ID r.ExecID to the task belonging to
// r.ContainerID. An empty exec ID is rejected with InvalidArgument.
func (l *local) Exec(ctx context.Context, r *api.ExecProcessRequest, _ ...grpc.CallOption) (*ptypes.Empty, error) {
        if r.ExecID == "" {
                return nil, status.Errorf(codes.InvalidArgument, "exec id cannot be empty")
        }

        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }

        execOpts := runtime.ExecOpts{
                Spec: r.Spec,
                IO: runtime.IO{
                        Stdin:    r.Stdin,
                        Stdout:   r.Stdout,
                        Stderr:   r.Stderr,
                        Terminal: r.Terminal,
                },
        }
        if _, err = tsk.Exec(ctx, r.ExecID, execOpts); err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return empty, nil
}

// ResizePty resizes the console of the task belonging to r.ContainerID, or of
// the exec process r.ExecID inside it, to r.Width by r.Height.
func (l *local) ResizePty(ctx context.Context, r *api.ResizePtyRequest, _ ...grpc.CallOption) (*ptypes.Empty, error) {
        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }

        var proc runtime.Process = tsk
        if r.ExecID != "" {
                proc, err = tsk.Process(ctx, r.ExecID)
                if err != nil {
                        return nil, errdefs.ToGRPC(err)
                }
        }

        size := runtime.ConsoleSize{
                Width:  r.Width,
                Height: r.Height,
        }
        if err = proc.ResizePty(ctx, size); err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return empty, nil
}

// CloseIO closes the IO of the task belonging to r.ContainerID, or of the exec
// process r.ExecID inside it. The process is always resolved first, but
// nothing is closed unless r.Stdin is set.
func (l *local) CloseIO(ctx context.Context, r *api.CloseIORequest, _ ...grpc.CallOption) (*ptypes.Empty, error) {
        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }

        var proc runtime.Process = tsk
        if r.ExecID != "" {
                proc, err = tsk.Process(ctx, r.ExecID)
                if err != nil {
                        return nil, errdefs.ToGRPC(err)
                }
        }

        if !r.Stdin {
                return empty, nil
        }
        if err = proc.CloseIO(ctx); err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return empty, nil
}

// Checkpoint checkpoints the task belonging to r.ContainerID.
//
// When the checkpoint options name an image path, the checkpoint is written
// there and an empty response is returned. Otherwise the checkpoint is written
// to a temporary directory (removed when the call returns), archived into the
// content store together with the marshalled container spec, and the two
// resulting descriptors are returned.
func (l *local) Checkpoint(ctx context.Context, r *api.CheckpointTaskRequest, _ ...grpc.CallOption) (*api.CheckpointTaskResponse, error) {
        container, err := l.getContainer(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }
        tsk, err := l.getTaskFromContainer(ctx, container)
        if err != nil {
                return nil, err
        }
        image, err := getCheckpointPath(container.Runtime.Name, r.Options)
        if err != nil {
                return nil, err
        }

        // commit is true when the caller gave no image path, meaning the
        // checkpoint has to be committed to the content store by this call.
        commit := image == ""
        if commit {
                image, err = os.MkdirTemp(os.Getenv("XDG_RUNTIME_DIR"), "ctrd-checkpoint")
                if err != nil {
                        return nil, errdefs.ToGRPC(err)
                }
                defer os.RemoveAll(image)
        }

        if err = tsk.Checkpoint(ctx, image, r.Options); err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        // do not commit the checkpoint image if an ImagePath was passed
        if !commit {
                return &api.CheckpointTaskResponse{}, nil
        }

        // write checkpoint to the content store
        diff := archive.Diff(ctx, "", image)
        cp, writeErr := l.writeContent(ctx, images.MediaTypeContainerd1Checkpoint, image, diff)
        // close the archive stream first; a close failure takes precedence
        // over a write failure
        if closeErr := diff.Close(); closeErr != nil {
                return nil, closeErr
        }
        if writeErr != nil {
                return nil, writeErr
        }

        // write the config to the content store
        data, err := proto.Marshal(protobuf.FromAny(container.Spec))
        if err != nil {
                return nil, err
        }
        specRef := filepath.Join(image, "spec")
        specD, err := l.writeContent(ctx, images.MediaTypeContainerd1CheckpointConfig, specRef, bytes.NewReader(data))
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        return &api.CheckpointTaskResponse{
                Descriptors: []*types.Descriptor{cp, specD},
        }, nil
}

// Update applies r.Resources and r.Annotations to the task belonging to
// r.ContainerID.
func (l *local) Update(ctx context.Context, r *api.UpdateTaskRequest, _ ...grpc.CallOption) (*ptypes.Empty, error) {
        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }
        err = tsk.Update(ctx, r.Resources, r.Annotations)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return empty, nil
}

// Metrics collects metrics for every task reported by the runtime that
// matches r.Filters. Errors from parsing the filters or listing the tasks are
// converted with errdefs.ToGRPC, in line with the other methods of this
// service (e.g. List), so that gRPC callers receive a proper status code.
func (l *local) Metrics(ctx context.Context, r *api.MetricsRequest, _ ...grpc.CallOption) (*api.MetricsResponse, error) {
        filter, err := filters.ParseAll(r.Filters...)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        tasks, err := l.v2Runtime.Tasks(ctx, false)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        var resp api.MetricsResponse
        getTasksMetrics(ctx, filter, tasks, &resp)
        return &resp, nil
}

// Wait blocks until the task belonging to r.ContainerID, or the exec process
// r.ExecID inside it, has exited and returns its exit status and exit time.
func (l *local) Wait(ctx context.Context, r *api.WaitRequest, _ ...grpc.CallOption) (*api.WaitResponse, error) {
        tsk, err := l.getTask(ctx, r.ContainerID)
        if err != nil {
                return nil, err
        }

        var proc runtime.Process = tsk
        if r.ExecID != "" {
                proc, err = tsk.Process(ctx, r.ExecID)
                if err != nil {
                        return nil, errdefs.ToGRPC(err)
                }
        }

        exit, err := proc.Wait(ctx)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        resp := &api.WaitResponse{
                ExitStatus: exit.Status,
                ExitedAt:   protobuf.ToTimestamp(exit.Timestamp),
        }
        return resp, nil
}

// getTasksMetrics appends one metric entry to r.Metrics for each task that
// matches filter. The filter can match on the "id" and "namespace" fields;
// "runtime" is recognized but yields no value. Tasks whose stats cannot be
// collected are skipped, and the failure is logged unless it is NotFound.
func getTasksMetrics(ctx context.Context, filter filters.Filter, tasks []runtime.Task, r *api.MetricsResponse) {
        for _, tk := range tasks {
                adaptor := filters.AdapterFunc(func(fieldpath []string) (string, bool) {
                        switch fieldpath[0] {
                        case "id":
                                return tk.ID(), true
                        case "namespace":
                                return tk.Namespace(), true
                        case "runtime":
                                // return tk.Info().Runtime, true
                        }
                        return "", false
                })
                if !filter.Match(adaptor) {
                        continue
                }

                // timestamp is taken before the stats call
                collected := time.Now()
                stats, err := tk.Stats(ctx)
                if err != nil {
                        if !errdefs.IsNotFound(err) {
                                log.G(ctx).WithError(err).Errorf("collecting metrics for %s", tk.ID())
                        }
                        continue
                }

                metric := &types.Metric{
                        Timestamp: protobuf.ToTimestamp(collected),
                        ID:        tk.ID(),
                        Data:      stats,
                }
                r.Metrics = append(r.Metrics, metric)
        }
}

// writeContent streams r into the content store under the reference ref and
// commits it. A commit that fails with AlreadyExists is treated as success.
// The returned descriptor carries mediaType, the writer's digest, the number
// of bytes copied and an empty annotations map.
func (l *local) writeContent(ctx context.Context, mediaType, ref string, r io.Reader) (*types.Descriptor, error) {
        desc := ocispec.Descriptor{MediaType: mediaType}
        writer, err := l.store.Writer(ctx, content.WithRef(ref), content.WithDescriptor(desc))
        if err != nil {
                return nil, err
        }
        defer writer.Close()

        size, err := io.Copy(writer, r)
        if err != nil {
                return nil, err
        }

        err = writer.Commit(ctx, 0, "")
        if err != nil && !errdefs.IsAlreadyExists(err) {
                return nil, err
        }

        result := &types.Descriptor{
                MediaType:   mediaType,
                Digest:      writer.Digest().String(),
                Size:        size,
                Annotations: make(map[string]string),
        }
        return result, nil
}

// getContainer looks up the container record for id. Lookup errors are
// converted with errdefs.ToGRPC before being returned.
func (l *local) getContainer(ctx context.Context, id string) (*containers.Container, error) {
        ctr, err := l.containers.Get(ctx, id)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        return &ctr, nil
}

// getTask resolves the container record for id and then the task that the
// runtime holds for that container.
func (l *local) getTask(ctx context.Context, id string) (runtime.Task, error) {
        ctr, err := l.getContainer(ctx, id)
        if err != nil {
                return nil, err
        }
        return l.getTaskFromContainer(ctx, ctr)
}

// getTaskFromContainer returns the task the runtime holds for container. Any
// failure of the runtime lookup is reported as a NotFound status, whatever the
// underlying error was.
func (l *local) getTaskFromContainer(ctx context.Context, container *containers.Container) (runtime.Task, error) {
        if tsk, err := l.v2Runtime.Get(ctx, container.ID); err == nil {
                return tsk, nil
        }
        return nil, status.Errorf(codes.NotFound, "task %v not found", container.ID)
}

// getCheckpointPath extracts the image path from checkpoint options. A nil
// option yields an empty path. The option must unmarshal to
// *options.CheckpointOptions (the runc options type), otherwise an error
// naming the runtime is returned; it is only suitable for the runc runtime
// for now.
func getCheckpointPath(runtime string, option *ptypes.Any) (string, error) {
        if option == nil {
                return "", nil
        }

        v, err := typeurl.UnmarshalAny(option)
        if err != nil {
                return "", err
        }
        if opts, ok := v.(*options.CheckpointOptions); ok {
                return opts.ImagePath, nil
        }
        return "", fmt.Errorf("invalid task checkpoint option for %s", runtime)
}

// formatOptions unmarshals task create options and returns them as
// *options.Options. An option of any other type produces an error naming the
// runtime.
func formatOptions(runtime string, option *ptypes.Any) (*options.Options, error) {
        v, err := typeurl.UnmarshalAny(option)
        if err != nil {
                return nil, err
        }
        if opts, ok := v.(*options.Options); ok {
                return opts, nil
        }
        return nil, fmt.Errorf("invalid task create option for %s", runtime)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package tasks

import (
        "context"

        api "github.com/containerd/containerd/api/services/tasks/v1"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/services"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "google.golang.org/grpc"
)

var (
        // compile-time check that *service satisfies api.TasksServer
        _ = (api.TasksServer)(&service{})
)

// init registers the "tasks" gRPC plugin. The plugin requires the service
// plugin type and wraps the tasks service instance registered there.
func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "tasks",
                Requires: []plugin.Type{
                        plugins.ServicePlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        i, err := ic.GetByID(plugins.ServicePlugin, services.TasksService)
                        if err != nil {
                                return nil, err
                        }
                        // unchecked assertion: panics if the plugin is not an
                        // api.TasksClient
                        return &service{local: i.(api.TasksClient)}, nil
                },
        })
}

// service is the gRPC tasks server. Every RPC is forwarded to local.
type service struct {
        // local is the tasks client that handles each request
        local api.TasksClient
        api.UnimplementedTasksServer
}

// Register attaches the tasks service to the given gRPC server. It always
// returns nil.
func (s *service) Register(server *grpc.Server) error {
        api.RegisterTasksServer(server, s)
        return nil
}

// Create forwards the request to the local tasks client.
func (s *service) Create(ctx context.Context, r *api.CreateTaskRequest) (*api.CreateTaskResponse, error) {
        return s.local.Create(ctx, r)
}

// Start forwards the request to the local tasks client.
func (s *service) Start(ctx context.Context, r *api.StartRequest) (*api.StartResponse, error) {
        return s.local.Start(ctx, r)
}

// Delete forwards the request to the local tasks client.
func (s *service) Delete(ctx context.Context, r *api.DeleteTaskRequest) (*api.DeleteResponse, error) {
        return s.local.Delete(ctx, r)
}

// DeleteProcess forwards the request to the local tasks client.
func (s *service) DeleteProcess(ctx context.Context, r *api.DeleteProcessRequest) (*api.DeleteResponse, error) {
        return s.local.DeleteProcess(ctx, r)
}

// Get forwards the request to the local tasks client.
func (s *service) Get(ctx context.Context, r *api.GetRequest) (*api.GetResponse, error) {
        return s.local.Get(ctx, r)
}

// List forwards the request to the local tasks client.
func (s *service) List(ctx context.Context, r *api.ListTasksRequest) (*api.ListTasksResponse, error) {
        return s.local.List(ctx, r)
}

// Pause forwards the request to the local tasks client.
func (s *service) Pause(ctx context.Context, r *api.PauseTaskRequest) (*ptypes.Empty, error) {
        return s.local.Pause(ctx, r)
}

// Resume forwards the request to the local tasks client.
func (s *service) Resume(ctx context.Context, r *api.ResumeTaskRequest) (*ptypes.Empty, error) {
        return s.local.Resume(ctx, r)
}

// Kill forwards the request to the local tasks client.
func (s *service) Kill(ctx context.Context, r *api.KillRequest) (*ptypes.Empty, error) {
        return s.local.Kill(ctx, r)
}

// ListPids forwards the request to the local tasks client.
func (s *service) ListPids(ctx context.Context, r *api.ListPidsRequest) (*api.ListPidsResponse, error) {
        return s.local.ListPids(ctx, r)
}

// Exec forwards the request to the local tasks client.
func (s *service) Exec(ctx context.Context, r *api.ExecProcessRequest) (*ptypes.Empty, error) {
        return s.local.Exec(ctx, r)
}

// ResizePty forwards the request to the local tasks client and returns its
// result unchanged.
func (s *service) ResizePty(ctx context.Context, r *api.ResizePtyRequest) (*ptypes.Empty, error) {
        return s.local.ResizePty(ctx, r)
}

// CloseIO forwards the request to the local tasks client and returns its
// result unchanged.
func (s *service) CloseIO(ctx context.Context, r *api.CloseIORequest) (*ptypes.Empty, error) {
        return s.local.CloseIO(ctx, r)
}

// Checkpoint forwards the request to the local tasks client and returns its
// result unchanged.
func (s *service) Checkpoint(ctx context.Context, r *api.CheckpointTaskRequest) (*api.CheckpointTaskResponse, error) {
        return s.local.Checkpoint(ctx, r)
}

// Update forwards the request to the local tasks client and returns its
// result unchanged.
func (s *service) Update(ctx context.Context, r *api.UpdateTaskRequest) (*ptypes.Empty, error) {
        return s.local.Update(ctx, r)
}

// Metrics forwards the request to the local tasks client and returns its
// result unchanged.
func (s *service) Metrics(ctx context.Context, r *api.MetricsRequest) (*api.MetricsResponse, error) {
        return s.local.Metrics(ctx, r)
}

// Wait forwards the request to the local tasks client and returns its result
// unchanged.
func (s *service) Wait(ctx context.Context, r *api.WaitRequest) (*api.WaitResponse, error) {
        return s.local.Wait(ctx, r)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package transfer

import (
        "context"

        transferapi "github.com/containerd/containerd/api/services/transfer/v1"
        "github.com/containerd/containerd/api/types"
        transferTypes "github.com/containerd/containerd/api/types/transfer"
        "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/containerd/v2/core/transfer"
        tplugins "github.com/containerd/containerd/v2/core/transfer/plugins"
        "github.com/containerd/containerd/v2/pkg/oci"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "github.com/containerd/typeurl/v2"
        "google.golang.org/grpc"
        "google.golang.org/grpc/codes"
        "google.golang.org/grpc/status"
        "google.golang.org/protobuf/types/known/emptypb"
)

// init registers the "transfer" gRPC plugin. The registration requires the
// transfer and streaming plugin types and uses newService as its initializer.
func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.GRPCPlugin,
                ID:   "transfer",
                Requires: []plugin.Type{
                        plugins.TransferPlugin,
                        plugins.StreamingPlugin,
                },
                InitFn: newService,
        })
}

// service is the gRPC transfer server. It dispatches transfer requests to a
// list of transferrers and resolves streams through a stream manager.
type service struct {
        // transferrers are tried one after another by Transfer.
        transferrers  []transfer.Transferrer
        // streamManager looks up streams referenced by incoming requests.
        streamManager streaming.StreamManager
        transferapi.UnimplementedTransferServer
}

// newService is the plugin initializer for the transfer gRPC service.
//
// It gathers every plugin of type TransferPlugin as a transfer.Transferrer
// and fetches the streaming plugin registered under the ID "manager". Lookup
// failures are returned to the caller as-is.
func newService(ic *plugin.InitContext) (interface{}, error) {
        transferPlugins, err := ic.GetByType(plugins.TransferPlugin)
        if err != nil {
                return nil, err
        }

        // TODO: how to determine order?
        svc := &service{
                transferrers: make([]transfer.Transferrer, 0, len(transferPlugins)),
        }
        for _, instance := range transferPlugins {
                svc.transferrers = append(svc.transferrers, instance.(transfer.Transferrer))
        }

        manager, err := ic.GetByID(plugins.StreamingPlugin, "manager")
        if err != nil {
                return nil, err
        }
        svc.streamManager = manager.(streaming.StreamManager)

        return svc, nil
}

// Register registers the service as the transfer server on the given gRPC
// server. It always returns nil.
func (s *service) Register(gs *grpc.Server) error {
        transferapi.RegisterTransferServer(gs, s)
        return nil
}

// Transfer handles a transfer request by converting its source and
// destination and handing them to the configured transferrers in turn.
//
// When the request names a progress stream, that stream is fetched from the
// stream manager, closed when Transfer returns, and fed a marshaled progress
// message for every progress callback. Marshal and send failures are only
// logged as warnings.
//
// The first transferrer that succeeds ends the call. A transferrer error
// other than "not implemented" is converted to a gRPC error and returned. If
// every transferrer reports "not implemented", an Unimplemented status is
// returned.
func (s *service) Transfer(ctx context.Context, req *transferapi.TransferRequest) (*emptypb.Empty, error) {
        var transferOpts []transfer.Opt

        if opts := req.Options; opts != nil && opts.ProgressStream != "" {
                stream, err := s.streamManager.Get(ctx, opts.ProgressStream)
                if err != nil {
                        return nil, errdefs.ToGRPC(err)
                }
                defer stream.Close()

                report := func(p transfer.Progress) {
                        msg := &transferTypes.Progress{
                                Event:    p.Event,
                                Name:     p.Name,
                                Parents:  p.Parents,
                                Progress: p.Progress,
                                Total:    p.Total,
                        }
                        // The descriptor is optional; leave it unset when absent.
                        if p.Desc != nil {
                                msg.Desc = oci.DescriptorToProto(*p.Desc)
                        }

                        packed, err := typeurl.MarshalAny(msg)
                        if err != nil {
                                log.G(ctx).WithError(err).Warnf("event could not be marshaled: %v/%v", p.Event, p.Name)
                                return
                        }
                        if err := stream.Send(packed); err != nil {
                                log.G(ctx).WithError(err).Warnf("event not sent: %v/%v", p.Event, p.Name)
                        }
                }

                transferOpts = append(transferOpts, transfer.WithProgress(report))
        }

        src, err := s.convertAny(ctx, req.Source)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }
        dst, err := s.convertAny(ctx, req.Destination)
        if err != nil {
                return nil, errdefs.ToGRPC(err)
        }

        for _, t := range s.transferrers {
                err := t.Transfer(ctx, src, dst, transferOpts...)
                if err == nil {
                        return &ptypes.Empty{}, nil
                }
                // "Not implemented" means this transferrer cannot handle the
                // pair; move on to the next one.
                if !errdefs.IsNotImplemented(err) {
                        return nil, errdefs.ToGRPC(err)
                }
        }
        return nil, status.Errorf(codes.Unimplemented, "method Transfer not implemented for %s to %s", req.Source.GetTypeUrl(), req.Destination.GetTypeUrl())
}

// convertAny turns a typeurl.Any into a concrete object.
//
// The type is first resolved through the transfer plugin registry. If that
// lookup reports "not found", the value is decoded with typeurl.UnmarshalAny;
// any other lookup error is returned. A resolved object implementing
// streamUnmarshaler decodes itself with access to the stream manager; every
// other object is filled in with typeurl.UnmarshalTo. The object is returned
// together with the decode error, if any.
func (s *service) convertAny(ctx context.Context, a typeurl.Any) (interface{}, error) {
        obj, err := tplugins.ResolveType(a)
        if err != nil {
                if !errdefs.IsNotFound(err) {
                        return nil, err
                }
                return typeurl.UnmarshalAny(a)
        }

        if su, ok := obj.(streamUnmarshaler); ok {
                return obj, su.UnmarshalAny(ctx, s.streamManager, a)
        }

        log.G(ctx).Debug("unmarshling to..")
        return obj, typeurl.UnmarshalTo(a, obj)
}

// streamUnmarshaler is implemented by objects that decode themselves from a
// typeurl.Any and are given a stream getter while doing so.
type streamUnmarshaler interface {
        UnmarshalAny(context.Context, streaming.StreamGetter, typeurl.Any) error
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package version

import (
        "context"

        api "github.com/containerd/containerd/api/services/version/v1"
        ptypes "github.com/containerd/containerd/v2/pkg/protobuf/types"
        "github.com/containerd/containerd/v2/plugins"
        ctrdversion "github.com/containerd/containerd/v2/version"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
        "google.golang.org/grpc"
)

// Compile-time check that service implements api.VersionServer.
var _ api.VersionServer = &service{}

// init registers the "version" gRPC plugin with initFunc as its initializer.
func init() {
        registry.Register(&plugin.Registration{
                Type:   plugins.GRPCPlugin,
                ID:     "version",
                InitFn: initFunc,
        })
}

// initFunc returns a new version service. The init context is not used and
// the call never fails.
func initFunc(ic *plugin.InitContext) (interface{}, error) {
        return &service{}, nil
}

// service is the gRPC version server. It holds no state of its own.
type service struct {
        api.UnimplementedVersionServer
}

// Register registers the service as the version server on the given gRPC
// server. It always returns nil.
func (s *service) Register(server *grpc.Server) error {
        api.RegisterVersionServer(server, s)
        return nil
}

// Version returns the Version and Revision values exposed by the ctrdversion
// package. The request argument is ignored and the call never fails.
func (s *service) Version(ctx context.Context, _ *ptypes.Empty) (*api.VersionResponse, error) {
        return &api.VersionResponse{
                Version:  ctrdversion.Version,
                Revision: ctrdversion.Revision,
        }, nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package warning

import (
        "context"
        "sync"
        "time"

        "github.com/containerd/log"

        deprecation "github.com/containerd/containerd/v2/pkg/deprecation"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

// Service records deprecation warnings and reports the ones recorded so far.
type Service interface {
        // Emit records an occurrence of the given deprecation warning.
        Emit(context.Context, deprecation.Warning)
        // Warnings returns the warnings that have been recorded.
        Warnings() []Warning
}

// init registers the deprecations warning plugin. Its initializer returns a
// service with an empty warnings map.
func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.WarningPlugin,
                ID:   plugins.DeprecationsPlugin,
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        return &service{warnings: make(map[deprecation.Warning]time.Time)}, nil
                },
        })
}

// Warning describes a recorded deprecation warning.
type Warning struct {
        // ID identifies the deprecation warning.
        ID             deprecation.Warning
        // LastOccurrence is the time the warning was most recently emitted.
        LastOccurrence time.Time
        // Message is the text associated with the warning ID.
        Message        string
}

// Compile-time check that *service implements Service.
var _ Service = (*service)(nil)

// service is the in-memory implementation of Service.
type service struct {
        // warnings maps each emitted warning to the time it was last emitted.
        warnings map[deprecation.Warning]time.Time
        // m guards warnings.
        m        sync.RWMutex
}

// Emit records the current time as the latest occurrence of warning.
// Warnings that deprecation.Valid rejects are not stored; they are logged at
// warn level with the offending ID instead.
func (s *service) Emit(ctx context.Context, warning deprecation.Warning) {
        if deprecation.Valid(warning) {
                s.m.Lock()
                defer s.m.Unlock()
                s.warnings[warning] = time.Now()
                return
        }
        log.G(ctx).WithField("warningID", string(warning)).Warn("invalid deprecation warning")
}
// Warnings returns one entry per recorded warning that has a message known to
// the deprecation package; warnings without a message are left out. The
// result is nil when nothing qualifies, and its order follows map iteration,
// so it is not stable between calls.
func (s *service) Warnings() []Warning {
        s.m.RLock()
        defer s.m.RUnlock()

        var result []Warning
        for id, seen := range s.warnings {
                if text, known := deprecation.Message(id); known {
                        result = append(result, Warning{
                                ID:             id,
                                LastOccurrence: seen,
                                Message:        text,
                        })
                }
        }
        return result
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package blockfile

import (
        "context"
        "fmt"
        "io"
        "os"
        "path/filepath"
        "runtime"

        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/core/snapshots/storage"
        "github.com/containerd/continuity/fs"
        "github.com/containerd/log"
        "github.com/containerd/plugin"
)

// viewHookHelper is only used in tests to recover the filesystem. It receives
// the backing file, the filesystem type and the default mount options.
type viewHookHelper func(backingFile string, fsType string, defaultOpts []string) error

// SnapshotterConfig holds the configurable properties for the blockfile snapshotter
type SnapshotterConfig struct {
        // recreateScratch is whether scratch should be recreated even
        // if it already exists
        recreateScratch bool

        // scratchGenerator creates the scratch file at the given path. It is
        // required whenever the scratch file has to be (re)created.
        scratchGenerator func(string) error

        // fsType is the filesystem type for the mount (defaults to ext4)
        fsType string

        // mountOptions are the base options added to the mount (defaults to ["loop"])
        mountOptions []string

        // testViewHookHelper is used to fsck or mount with rw to handle
        // the recovery. If we mount ro for view snapshot, we might hit
        // an issue like
        //
        //  (ext4) INFO: recovery required on readonly filesystem
        //  (ext4) write access unavailable, cannot proceed (try mounting with noload)
        //
        // FIXME(fuweid): I don't hit the readonly issue on SSD storage, but it
        // is easy to reproduce on slow storage.
        testViewHookHelper viewHookHelper
}

// Opt is an option to configure the blockfile snapshotter. It receives the
// snapshotter root directory and the configuration to modify.
type Opt func(string, *SnapshotterConfig)

// WithScratchFile installs a scratch generator that copies src to the
// requested scratch path. The copy only happens when the snapshotter decides
// the scratch file needs to be generated; a copy failure is wrapped and
// returned by the generator.
func WithScratchFile(src string) Opt {
        generate := func(dst string) error {
                // Copy src to dst
                err := copyFileWithSync(dst, src)
                if err == nil {
                        return nil
                }
                return fmt.Errorf("failed to copy scratch: %w", err)
        }
        return func(_ string, config *SnapshotterConfig) {
                config.scratchGenerator = generate
        }
}

// WithFSType sets the filesystem type used for mounts of the blockfile.
func WithFSType(fsType string) Opt {
        return func(_ string, cfg *SnapshotterConfig) { cfg.fsType = fsType }
}

// WithMountOptions sets the base mount options used for the mount. The slice
// is stored as given, without copying.
func WithMountOptions(options []string) Opt {
        return func(_ string, cfg *SnapshotterConfig) { cfg.mountOptions = options }
}

// WithRecreateScratch controls whether the scratch file is recreated even
// when it already exists.
func WithRecreateScratch(recreate bool) Opt {
        return func(_ string, cfg *SnapshotterConfig) { cfg.recreateScratch = recreate }
}

// withViewHookHelper installs a hook that is invoked while a snapshot is
// being created, with the block file path, filesystem type and mount options.
// It should be used in tests only.
//
//nolint:nolintlint,unused // not used on all platforms
func withViewHookHelper(fn viewHookHelper) Opt {
        return func(_ string, config *SnapshotterConfig) {
                config.testViewHookHelper = fn
        }
}

// snapshotter is the blockfile snapshotter. Each snapshot is backed by a
// block file stored under root/snapshots.
type snapshotter struct {
        // root is the directory holding the scratch file, metadata and snapshots.
        root    string
        // scratch is the path of the scratch file copied for parentless snapshots.
        scratch string
        // fsType is the filesystem type reported in returned mounts.
        fsType  string
        // options are the base mount options for returned mounts.
        options []string
        // ms is the metadata store used for all snapshot transactions.
        ms      *storage.MetaStore

        // testViewHookHelper, when set, is called after a snapshot's block
        // file has been prepared.
        testViewHookHelper viewHookHelper
}

// NewSnapshotter creates a blockfile snapshotter rooted at root.
//
// The root directory is created if needed and the options are applied to a
// fresh configuration. The scratch file lives at root/scratch; it is
// generated when it does not exist or when recreation was requested, which
// requires a scratch generator to be configured (otherwise an error wrapping
// plugin.ErrSkipPlugin is returned). The filesystem type defaults to "ext4"
// and the mount options to ["loop"]. Metadata is kept in root/metadata.db and
// block files in root/snapshots.
func NewSnapshotter(root string, opts ...Opt) (snapshots.Snapshotter, error) {
        if err := os.MkdirAll(root, 0700); err != nil {
                return nil, err
        }

        var config SnapshotterConfig
        for _, apply := range opts {
                apply(root, &config)
        }

        scratch := filepath.Join(root, "scratch")
        needScratch := config.recreateScratch
        if !needScratch {
                _, statErr := os.Stat(scratch)
                switch {
                case statErr == nil:
                        // The existing scratch file is reused.
                case os.IsNotExist(statErr):
                        needScratch = true
                default:
                        return nil, fmt.Errorf("unable to stat scratch file: %w", statErr)
                }
        }
        if needScratch {
                generate := config.scratchGenerator
                if generate == nil {
                        return nil, fmt.Errorf("no scratch file generator: %w", plugin.ErrSkipPlugin)
                }
                if err := generate(scratch); err != nil {
                        return nil, fmt.Errorf("failed to generate scratch file: %w", err)
                }
        }

        fsType := config.fsType
        if fsType == "" {
                fsType = "ext4"
        }

        mountOptions := config.mountOptions
        if mountOptions == nil {
                mountOptions = []string{"loop"}
        }

        ms, err := storage.NewMetaStore(filepath.Join(root, "metadata.db"))
        if err != nil {
                return nil, err
        }

        // An already existing snapshots directory is fine.
        snapshotsDir := filepath.Join(root, "snapshots")
        if mkErr := os.Mkdir(snapshotsDir, 0700); mkErr != nil && !os.IsExist(mkErr) {
                return nil, mkErr
        }

        sn := &snapshotter{
                root:               root,
                scratch:            scratch,
                fsType:             fsType,
                options:            mountOptions,
                ms:                 ms,
                testViewHookHelper: config.testViewHookHelper,
        }
        return sn, nil
}

// Stat looks up the info of an active or committed snapshot by name or key
// inside a read-only metadata transaction.
//
// It is meant for parent resolution, existence checks and for telling what
// kind of snapshot a key refers to. On failure a zero Info is returned along
// with the error.
func (o *snapshotter) Stat(ctx context.Context, key string) (info snapshots.Info, err error) {
        lookup := func(ctx context.Context) error {
                var getErr error
                _, info, _, getErr = storage.GetInfo(ctx, key)
                return getErr
        }
        if err = o.ms.WithTransaction(ctx, false, lookup); err != nil {
                return snapshots.Info{}, err
        }
        return info, nil
}

// Update applies the given field paths of info to the stored snapshot info
// inside a writable metadata transaction and returns the updated info. On
// failure a zero Info is returned along with the error.
func (o *snapshotter) Update(ctx context.Context, info snapshots.Info, fieldpaths ...string) (_ snapshots.Info, err error) {
        apply := func(ctx context.Context) error {
                updated, updateErr := storage.UpdateInfo(ctx, info, fieldpaths...)
                info = updated
                return updateErr
        }
        if err = o.ms.WithTransaction(ctx, true, apply); err != nil {
                return snapshots.Info{}, err
        }
        return info, nil
}

// Usage reports the resource usage of the snapshot identified by key.
//
// The lookup runs in a read-only metadata transaction. For active snapshots
// the size is taken from the block file on disk and the inode count is set to
// one; otherwise the stored usage is used. When the snapshot has a parent,
// the parent's stored size is subtracted. On failure a zero Usage is returned
// along with the error.
func (o *snapshotter) Usage(ctx context.Context, key string) (usage snapshots.Usage, err error) {
        var (
                id   string
                info snapshots.Info
        )

        err = o.ms.WithTransaction(ctx, false, func(ctx context.Context) error {
                // Assigns to the named results of Usage, not to new locals.
                id, info, usage, err = storage.GetInfo(ctx, key)
                if err != nil {
                        return err
                }

                // Current usage calculation is an approximation based on the size
                // of the block file - the size of its parent. This does not consider
                // that the filesystem may not support shared extents between the block
                // file and its parents, in which case the accurate calculation would just
                // be the size of the block file. Additionally, this does not take into
                // consideration that file may have been removed before being added,
                // making the number of shared extents between the parent and the block
                // file smaller than the parent, under reporting actual usage.
                //
                // A more ideal calculation would look like:
                //  size(block) - usage(extent_intersection(block,parent))
                // OR
                //  usage(extent_union(block,parent)) - size(parent)

                if info.Kind == snapshots.KindActive {
                        // TODO: Use size calculator from fs package
                        st, err := os.Stat(o.getBlockFile(id))
                        if err != nil {
                                return err
                        }
                        usage.Size = st.Size()
                        usage.Inodes = 1
                }

                if info.Parent != "" {
                        // GetInfo returns total number of bytes used by a snapshot (including parent).
                        // So subtract parent usage in order to get delta consumed by layer itself.
                        _, _, parentUsage, err := storage.GetInfo(ctx, info.Parent)
                        if err != nil {
                                return err
                        }

                        usage.Size -= parentUsage.Size
                }

                // The err values declared in the branches above are scoped to
                // those branches; this is the outer err, which is nil here
                // because the GetInfo call at the top succeeded.
                return err
        })
        if err != nil {
                return snapshots.Usage{}, err
        }

        return usage, nil
}

// Prepare creates an active snapshot identified by key on top of parent and
// returns its mounts.
func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
        return o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts)
}

// View creates a view snapshot identified by key on top of parent and returns
// its mounts.
func (o *snapshotter) View(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
        return o.createSnapshot(ctx, snapshots.KindView, key, parent, opts)
}

// Mounts returns the mounts for the snapshot identified by key, whether it is
// read-write or read-only. The snapshot is loaded in a read-only metadata
// transaction.
//
// This can be used to recover mounts after calling View or Prepare.
func (o *snapshotter) Mounts(ctx context.Context, key string) (_ []mount.Mount, err error) {
        var snap storage.Snapshot

        load := func(ctx context.Context) error {
                var getErr error
                if snap, getErr = storage.GetSnapshot(ctx, key); getErr != nil {
                        return fmt.Errorf("failed to get snapshot mount: %w", getErr)
                }
                return nil
        }
        if err = o.ms.WithTransaction(ctx, false, load); err != nil {
                return nil, err
        }

        return o.mounts(snap), nil
}

// Commit turns the active snapshot identified by key into a committed
// snapshot called name. The recorded usage is the current size of the
// snapshot's block file with an inode count of one. Everything happens inside
// a single writable metadata transaction.
func (o *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error {
        commit := func(ctx context.Context) error {
                id, _, _, err := storage.GetInfo(ctx, key)
                if err != nil {
                        return err
                }

                blockInfo, err := os.Stat(o.getBlockFile(id))
                if err != nil {
                        return err
                }

                usage := snapshots.Usage{
                        Inodes: 1,
                        Size:   blockInfo.Size(),
                }
                _, err = storage.CommitActive(ctx, key, name, usage, opts...)
                if err != nil {
                        return fmt.Errorf("failed to commit snapshot: %w", err)
                }
                return nil
        }
        return o.ms.WithTransaction(ctx, true, commit)
}

// Remove abandons the transaction identified by key. All resources
// associated with the key will be removed.
//
// The block file is first renamed to an "rm-" prefixed name inside the
// metadata transaction and only deleted once the transaction has succeeded.
// If the transaction fails after the rename, the rename is undone.
func (o *snapshotter) Remove(ctx context.Context, key string) (err error) {
        var (
                renamed, path string
                // restore is set once the transaction callback has completed,
                // meaning a later failure must undo the rename.
                restore       bool
        )

        err = o.ms.WithTransaction(ctx, true, func(ctx context.Context) error {
                id, _, err := storage.Remove(ctx, key)
                if err != nil {
                        return fmt.Errorf("failed to remove: %w", err)
                }

                path = o.getBlockFile(id)
                renamed = filepath.Join(o.root, "snapshots", "rm-"+id)
                if err = os.Rename(path, renamed); err != nil {
                        if !os.IsNotExist(err) {
                                return fmt.Errorf("failed to rename: %w", err)
                        }
                        // Nothing was renamed, so there is nothing to clean up
                        // or restore afterwards.
                        renamed = ""
                }

                restore = true
                return nil
        })

        if err != nil {
                if renamed != "" && restore {
                        if err1 := os.Rename(renamed, path); err1 != nil {
                                // May cause inconsistent data on disk
                                log.G(ctx).WithError(err1).WithField("path", renamed).Error("failed to rename after failed commit")
                        }
                }
                return err
        }
        if renamed != "" {
                if err := os.Remove(renamed); err != nil {
                        // Must be cleaned up, any "rm-*" could be removed if no active transactions
                        log.G(ctx).WithError(err).WithField("path", renamed).Warnf("failed to remove root filesystem")
                }
        }

        return nil
}

// Walk calls fn for the snapshot infos selected by the filters fs. The walk
// runs inside a read-only metadata transaction.
func (o *snapshotter) Walk(ctx context.Context, fn snapshots.WalkFunc, fs ...string) error {
        return o.ms.WithTransaction(ctx, false, func(ctx context.Context) error {
                return storage.WalkInfo(ctx, fn, fs...)
        })
}

// createSnapshot records a new snapshot of the given kind in the metadata
// store and prepares its block file, all within one writable transaction.
//
// A snapshot without parents, or an active snapshot, gets its own block file:
// a copy of the first parent's block file when there is a parent, otherwise a
// copy of the scratch file. A view with a parent gets no file of its own and
// uses the first parent's block file directly. If a test view hook is
// configured it is called with the chosen path. On success the mounts for the
// new snapshot are returned.
func (o *snapshotter) createSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) (_ []mount.Mount, err error) {
        var s storage.Snapshot

        err = o.ms.WithTransaction(ctx, true, func(ctx context.Context) error {
                s, err = storage.CreateSnapshot(ctx, kind, key, parent, opts...)
                if err != nil {
                        return fmt.Errorf("failed to create snapshot: %w", err)
                }

                var path string
                if len(s.ParentIDs) == 0 || s.Kind == snapshots.KindActive {
                        path = o.getBlockFile(s.ID)

                        if len(s.ParentIDs) > 0 {
                                if err = copyFileWithSync(path, o.getBlockFile(s.ParentIDs[0])); err != nil {
                                        return fmt.Errorf("copying of parent failed: %w", err)
                                }
                        } else {
                                if err = copyFileWithSync(path, o.scratch); err != nil {
                                        return fmt.Errorf("copying of scratch failed: %w", err)
                                }
                        }
                } else {
                        // View on top of a parent: reuse the parent's block file.
                        path = o.getBlockFile(s.ParentIDs[0])
                }

                if o.testViewHookHelper != nil {
                        if err := o.testViewHookHelper(path, o.fsType, o.options); err != nil {
                                return fmt.Errorf("failed to handle the viewHookHelper: %w", err)
                        }
                }

                return nil
        })
        if err != nil {
                return nil, err
        }

        return o.mounts(s), nil
}

// getBlockFile returns the path of the block file for the snapshot with the
// given ID, located in the "snapshots" directory under the root.
func (o *snapshotter) getBlockFile(id string) string {
        return filepath.Join(o.root, "snapshots", id)
}

// mounts builds the single mount that exposes snapshot s.
//
// The mount options are the configured base options followed by "ro" for
// views and "rw" for everything else. The source is the snapshot's own block
// file when it has no parents or is active; otherwise it is the block file of
// its first parent.
func (o *snapshotter) mounts(s storage.Snapshot) []mount.Mount {
        // Build the options in a fresh slice. Appending directly to o.options
        // would write into its backing array whenever it has spare capacity,
        // so mounts returned by different calls could overwrite each other's
        // access mode.
        mountOptions := make([]string, 0, len(o.options)+1)
        mountOptions = append(mountOptions, o.options...)

        if s.Kind == snapshots.KindView {
                mountOptions = append(mountOptions, "ro")
        } else {
                mountOptions = append(mountOptions, "rw")
        }

        var source string
        if len(s.ParentIDs) == 0 || s.Kind == snapshots.KindActive {
                source = o.getBlockFile(s.ID)
        } else {
                source = o.getBlockFile(s.ParentIDs[0])
        }

        return []mount.Mount{
                {
                        Source:  source,
                        Type:    o.fsType,
                        Options: mountOptions,
                },
        }
}

// Close closes the snapshotter by closing its metadata store and returns the
// store's error, if any.
func (o *snapshotter) Close() error {
        return o.ms.Close()
}

// copyFileWithSync copies source to target, creating or truncating target,
// and flushes the result to stable storage before reporting success.
//
// On Darwin the copy is delegated to fs.CopyFile. On other platforms the file
// is streamed with io.Copy and then synced. Errors from syncing or closing
// the target are returned rather than dropped, so a copy that could not be
// persisted is never reported as successful.
func copyFileWithSync(target, source string) (retErr error) {
        // The Go stdlib does not seem to have an efficient os.File.ReadFrom
        // routine for other platforms like it does on Linux with
        // copy_file_range. For Darwin at least we can use clonefile
        // in its place, otherwise if we have a sparse file we'd have
        // a fun surprise waiting below.
        //
        // TODO: Enlighten other platforms (windows?)
        if runtime.GOOS == "darwin" {
                return fs.CopyFile(target, source)
        }

        src, err := os.Open(source)
        if err != nil {
                return fmt.Errorf("failed to open source %s: %w", source, err)
        }
        defer src.Close()

        tgt, err := os.Create(target)
        if err != nil {
                return fmt.Errorf("failed to open target %s: %w", target, err)
        }
        defer func() {
                // A close failure on a written file can mean lost data; report
                // it unless an earlier error is already being returned.
                if cerr := tgt.Close(); cerr != nil && retErr == nil {
                        retErr = fmt.Errorf("failed to close target %s: %w", target, cerr)
                }
        }()

        if _, err := io.Copy(tgt, src); err != nil {
                return err
        }
        if err := tgt.Sync(); err != nil {
                return fmt.Errorf("failed to sync target %s: %w", target, err)
        }
        return nil
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package plugin

import (
        "errors"

        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/snapshots/blockfile"
        "github.com/containerd/platforms"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

// Config represents configuration for the blockfile plugin.
type Config struct {
        // Root directory for the plugin
        RootPath string `toml:"root_path"`

        // ScratchFile is the scratch block file to use as an empty block
        ScratchFile string `toml:"scratch_file"`

        // FSType is the filesystem type for the mount
        FSType string `toml:"fs_type"`

        // MountOptions are options used for the mount
        MountOptions []string `toml:"mount_options"`

        // RecreateScratch always recreates the specified `ScratchFile`
        // on initialization of the plugin instead of using an existing one.
        RecreateScratch bool `toml:"recreate_scratch"`
}

// init registers the "blockfile" snapshot plugin. Its initializer translates
// the plugin Config into blockfile options, exports the chosen root directory
// and creates the snapshotter.
func init() {
        registry.Register(&plugin.Registration{
                Type:   plugins.SnapshotPlugin,
                ID:     "blockfile",
                Config: &Config{},
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        ic.Meta.Platforms = append(ic.Meta.Platforms, platforms.DefaultSpec())

                        config, ok := ic.Config.(*Config)
                        if !ok {
                                return nil, errors.New("invalid blockfile configuration")
                        }

                        var opts []blockfile.Opt
                        // A configured root path overrides the plugin's default root.
                        root := ic.Properties[plugins.PropertyRootDir]
                        if len(config.RootPath) != 0 {
                                root = config.RootPath
                        }
                        if config.ScratchFile != "" {
                                opts = append(opts, blockfile.WithScratchFile(config.ScratchFile))
                        }
                        if config.FSType != "" {
                                opts = append(opts, blockfile.WithFSType(config.FSType))
                        }
                        if len(config.MountOptions) > 0 {
                                opts = append(opts, blockfile.WithMountOptions(config.MountOptions))
                        }
                        opts = append(opts, blockfile.WithRecreateScratch(config.RecreateScratch))

                        ic.Meta.Exports[plugins.SnapshotterRootDir] = root
                        return blockfile.NewSnapshotter(root, opts...)
                },
        })
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package native

import (
        "context"
        "fmt"
        "os"
        "path/filepath"

        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/core/snapshots/storage"
        "github.com/containerd/log"

        "github.com/containerd/continuity/fs"
)

// snapshotter holds the state of the native snapshotter: the directory it
// manages and the metadata store its operations run transactions against.
type snapshotter struct {
        // root is the base directory; snapshot contents live under
        // <root>/snapshots (see getSnapshotDir).
        root string
        // ms is the metadata store created by NewSnapshotter for
        // <root>/metadata.db.
        ms   *storage.MetaStore
}

// NewSnapshotter builds the native snapshotter, which materialises every
// snapshot as a plain directory copy on the underlying file system.
//
// The root directory is created if missing, the metadata store is pointed at
// "metadata.db" inside it, and a "snapshots" sub-directory is ensured to
// exist. Any failure along the way is returned unchanged.
func NewSnapshotter(root string) (snapshots.Snapshotter, error) {
        err := os.MkdirAll(root, 0700)
        if err != nil {
                return nil, err
        }

        metaStore, err := storage.NewMetaStore(filepath.Join(root, "metadata.db"))
        if err != nil {
                return nil, err
        }

        // An already existing snapshots directory is fine; anything else is fatal.
        snapshotsDir := filepath.Join(root, "snapshots")
        if mkErr := os.Mkdir(snapshotsDir, 0700); mkErr != nil {
                if !os.IsExist(mkErr) {
                        return nil, mkErr
                }
        }

        sn := &snapshotter{
                root: root,
                ms:   metaStore,
        }
        return sn, nil
}

// Stat looks up the metadata of the snapshot identified by key (either an
// active key or a committed name) inside a non-writable transaction.
//
// It is the entry point for parent resolution, existence checks and for
// finding out what kind of snapshot a key refers to. On failure the zero
// Info is returned together with the error.
func (o *snapshotter) Stat(ctx context.Context, key string) (info snapshots.Info, err error) {
        lookup := func(ctx context.Context) error {
                var getErr error
                _, info, _, getErr = storage.GetInfo(ctx, key)
                return getErr
        }

        if err = o.ms.WithTransaction(ctx, false, lookup); err != nil {
                return snapshots.Info{}, err
        }
        return info, nil
}

// Update applies info to the stored snapshot metadata, restricted to the
// given fieldpaths, inside a writable transaction and returns the Info
// reported by storage.UpdateInfo. On failure the zero Info is returned
// together with the error.
func (o *snapshotter) Update(ctx context.Context, info snapshots.Info, fieldpaths ...string) (_ snapshots.Info, err error) {
        var updated snapshots.Info

        apply := func(ctx context.Context) error {
                var updateErr error
                updated, updateErr = storage.UpdateInfo(ctx, info, fieldpaths...)
                return updateErr
        }

        if err = o.ms.WithTransaction(ctx, true, apply); err != nil {
                return snapshots.Info{}, err
        }
        return updated, nil
}

// Usage reports the resources consumed by the snapshot identified by key.
//
// For active snapshots the snapshot directory is scanned with fs.DiskUsage;
// for every other kind the usage recorded in the metadata store is returned
// as is. On failure the zero Usage is returned together with the error.
func (o *snapshotter) Usage(ctx context.Context, key string) (usage snapshots.Usage, err error) {
        var (
                snapshotID string
                meta       snapshots.Info
        )

        read := func(ctx context.Context) error {
                var getErr error
                snapshotID, meta, usage, getErr = storage.GetInfo(ctx, key)
                return getErr
        }
        if err = o.ms.WithTransaction(ctx, false, read); err != nil {
                return snapshots.Usage{}, err
        }

        // Only active snapshots need a fresh measurement.
        if meta.Kind != snapshots.KindActive {
                return usage, nil
        }

        measured, err := fs.DiskUsage(ctx, o.getSnapshotDir(snapshotID))
        if err != nil {
                return snapshots.Usage{}, err
        }
        return snapshots.Usage(measured), nil
}

// Prepare creates a snapshot of kind snapshots.KindActive identified by key
// on top of parent (which may be empty) and returns the mounts for it. All
// work is delegated to createSnapshot.
func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
        return o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts)
}

// View creates a snapshot of kind snapshots.KindView identified by key on top
// of parent and returns the mounts for it. All work is delegated to
// createSnapshot.
func (o *snapshotter) View(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
        return o.createSnapshot(ctx, snapshots.KindView, key, parent, opts)
}

// Mounts returns the mounts for the snapshot identified by key. It works for
// both read-write and read-only snapshots.
//
// Use it to recover the mounts previously handed out by View or Prepare.
func (o *snapshotter) Mounts(ctx context.Context, key string) (_ []mount.Mount, err error) {
        var snap storage.Snapshot

        load := func(ctx context.Context) error {
                var getErr error
                if snap, getErr = storage.GetSnapshot(ctx, key); getErr != nil {
                        return fmt.Errorf("failed to get snapshot mount: %w", getErr)
                }
                return nil
        }

        if err = o.ms.WithTransaction(ctx, false, load); err != nil {
                return nil, err
        }
        return o.mounts(snap), nil
}

// Commit turns the active snapshot key into the committed snapshot name.
//
// Inside a single writable transaction it resolves the snapshot ID, measures
// the disk usage of the snapshot directory and records that usage while
// committing via storage.CommitActive.
func (o *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error {
        commit := func(ctx context.Context) error {
                snapshotID, _, _, err := storage.GetInfo(ctx, key)
                if err != nil {
                        return err
                }

                du, err := fs.DiskUsage(ctx, o.getSnapshotDir(snapshotID))
                if err != nil {
                        return err
                }

                _, err = storage.CommitActive(ctx, key, name, snapshots.Usage(du), opts...)
                if err != nil {
                        return fmt.Errorf("failed to commit snapshot: %w", err)
                }
                return nil
        }

        return o.ms.WithTransaction(ctx, true, commit)
}

// Remove abandons the transaction identified by key. All resources
// associated with the key will be removed.
//
// The metadata entry is removed and the snapshot directory is renamed to
// "rm-<id>" inside one writable transaction. If the transaction fails after
// the rename, the directory is renamed back; if it succeeds, the renamed
// directory is deleted on a best-effort basis (a failure is only logged).
func (o *snapshotter) Remove(ctx context.Context, key string) (err error) {
        var (
                // renamed is the "rm-<id>" location, or "" when nothing was renamed;
                // path is the original snapshot directory.
                renamed, path string
                // restore is set once the rename step has completed, i.e. a later
                // transaction failure must move the directory back.
                restore       bool
        )

        err = o.ms.WithTransaction(ctx, true, func(ctx context.Context) error {
                id, _, err := storage.Remove(ctx, key)
                if err != nil {
                        return fmt.Errorf("failed to remove: %w", err)
                }

                path = o.getSnapshotDir(id)
                renamed = filepath.Join(o.root, "snapshots", "rm-"+id)
                if err = os.Rename(path, renamed); err != nil {
                        if !os.IsNotExist(err) {
                                return fmt.Errorf("failed to rename: %w", err)
                        }
                        // The snapshot has no directory on disk: nothing to restore
                        // or delete afterwards.
                        renamed = ""
                }

                restore = true
                return nil
        })

        if err != nil {
                // The transaction failed; put the directory back if it was moved.
                if renamed != "" && restore {
                        if err1 := os.Rename(renamed, path); err1 != nil {
                                // May cause inconsistent data on disk
                                log.G(ctx).WithError(err1).WithField("path", renamed).Error("failed to rename after failed commit")
                        }
                }
                return err
        }
        if renamed != "" {
                if err := os.RemoveAll(renamed); err != nil {
                        // Must be cleaned up, any "rm-*" could be removed if no active transactions
                        log.G(ctx).WithError(err).WithField("path", renamed).Warnf("failed to remove root filesystem")
                }
        }

        return nil
}

// Walk hands fn and the filter strings fs to storage.WalkInfo inside a
// non-writable transaction and returns whatever error that produces.
func (o *snapshotter) Walk(ctx context.Context, fn snapshots.WalkFunc, fs ...string) error {
        walk := func(ctx context.Context) error {
                return storage.WalkInfo(ctx, fn, fs...)
        }
        return o.ms.WithTransaction(ctx, false, walk)
}

// createSnapshot registers a new snapshot of the given kind under key and,
// when the snapshot needs its own directory, populates that directory.
//
// A directory is needed for active snapshots and for snapshots without a
// parent. It is first built as a "new-*" temporary directory under
// <root>/snapshots, filled with a copy of the immediate parent (if any) and
// then renamed to its final, ID-based location inside the metadata
// transaction. Views with a parent get no directory of their own.
//
// On any failure the temporary directory and, if the rename already
// happened, the final directory are removed again; removal errors are folded
// into the returned error. On success the mounts for the new snapshot are
// returned.
func (o *snapshotter) createSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) (_ []mount.Mount, err error) {
        var (
                path, td string
                s        storage.Snapshot
        )

        if kind == snapshots.KindActive || parent == "" {
                td, err = os.MkdirTemp(filepath.Join(o.root, "snapshots"), "new-")
                if err != nil {
                        return nil, fmt.Errorf("failed to create temp dir: %w", err)
                }
                // Register the cleanup right after the directory exists so that
                // every later failure, including the chmod below, removes it
                // instead of leaking a "new-*" directory.
                defer func() {
                        if err != nil {
                                if td != "" {
                                        if err1 := os.RemoveAll(td); err1 != nil {
                                                err = fmt.Errorf("remove failed: %v: %w", err1, err)
                                        }
                                }
                                if path != "" {
                                        if err1 := os.RemoveAll(path); err1 != nil {
                                                err = fmt.Errorf("failed to remove path: %v: %w", err1, err)
                                        }
                                }
                        }
                }()
                // MkdirTemp creates the directory with restrictive permissions;
                // widen them to 0755.
                if err = os.Chmod(td, 0755); err != nil {
                        return nil, fmt.Errorf("failed to chmod %s to 0755: %w", td, err)
                }
        }

        err = o.ms.WithTransaction(ctx, true, func(ctx context.Context) error {
                s, err = storage.CreateSnapshot(ctx, kind, key, parent, opts...)
                if err != nil {
                        return fmt.Errorf("failed to create snapshot: %w", err)
                }

                if td != "" {
                        if len(s.ParentIDs) > 0 {
                                parentDir := o.getSnapshotDir(s.ParentIDs[0])
                                xattrErrorHandler := func(dst, src, xattrKey string, copyErr error) error {
                                        // security.* xattr cannot be copied in most cases (moby/buildkit#1189)
                                        log.G(ctx).WithError(copyErr).Debugf("failed to copy xattr %q", xattrKey)
                                        return nil
                                }
                                copyDirOpts := []fs.CopyDirOpt{
                                        fs.WithXAttrErrorHandler(xattrErrorHandler),
                                }
                                if err = fs.CopyDir(td, parentDir, copyDirOpts...); err != nil {
                                        return fmt.Errorf("copying of parent failed: %w", err)
                                }
                        }

                        path = o.getSnapshotDir(s.ID)
                        if err = os.Rename(td, path); err != nil {
                                return fmt.Errorf("failed to rename: %w", err)
                        }
                        // The temp directory now lives at path; only path needs
                        // cleaning up if the transaction still fails.
                        td = ""
                }

                return nil
        })
        if err != nil {
                return nil, err
        }

        return o.mounts(s), nil
}

// getSnapshotDir returns the on-disk directory of the snapshot with the given
// ID: <root>/snapshots/<id>.
func (o *snapshotter) getSnapshotDir(id string) string {
        return filepath.Join(o.root, "snapshots", id)
}

// mounts builds the single mount describing snapshot s.
//
// Views are mounted "ro", everything else "rw". The mount source is the
// snapshot's own directory when it is active or has no parents; otherwise
// the directory of its immediate parent is used.
func (o *snapshotter) mounts(s storage.Snapshot) []mount.Mount {
        accessFlag := "rw"
        if s.Kind == snapshots.KindView {
                accessFlag = "ro"
        }

        sourceID := s.ID
        if len(s.ParentIDs) > 0 && s.Kind != snapshots.KindActive {
                sourceID = s.ParentIDs[0]
        }

        m := mount.Mount{
                Source:  o.getSnapshotDir(sourceID),
                Type:    mountType,
                Options: append(defaultMountOptions, accessFlag),
        }
        return []mount.Mount{m}
}

// Close closes the snapshotter by closing its metadata store and returns the
// store's error, if any.
func (o *snapshotter) Close() error {
        return o.ms.Close()
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package plugin

import (
        "errors"

        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/snapshots/native"
        "github.com/containerd/platforms"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

// Config represents configuration for the native plugin.
type Config struct {
        // RootPath is the root directory for the plugin. When empty, the
        // plugin's init function falls back to the
        // plugins.PropertyRootDir property of the init context.
        RootPath string `toml:"root_path"`
}

func init() {
        registry.Register(&plugin.Registration{
                Type:   plugins.SnapshotPlugin,
                ID:     "native",
                Config: &Config{},
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        ic.Meta.Platforms = append(ic.Meta.Platforms, platforms.DefaultSpec())

                        config, ok := ic.Config.(*Config)
                        if !ok {
                                return nil, errors.New("invalid native configuration")
                        }

                        root := ic.Properties[plugins.PropertyRootDir]
                        if len(config.RootPath) != 0 {
                                root = config.RootPath
                        }

                        ic.Meta.Exports[plugins.SnapshotterRootDir] = root
                        return native.NewSnapshotter(root)
                },
        })
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package overlay

import (
        "context"
        "fmt"
        "os"
        "path/filepath"
        "strings"
        "syscall"

        "github.com/containerd/containerd/v2/core/mount"
        "github.com/containerd/containerd/v2/core/snapshots"
        "github.com/containerd/containerd/v2/core/snapshots/storage"
        "github.com/containerd/containerd/v2/plugins/snapshots/overlay/overlayutils"
        "github.com/containerd/continuity/fs"
        "github.com/containerd/log"
)

// upperdirKey is the key of an optional label attached to each snapshot.
// The label's value is the location of the "upperdir", where the change set
// between this snapshot and its parent is stored. It is only added when the
// snapshotter was created with WithUpperdirLabel.
const upperdirKey = "containerd.io/snapshot/overlay.upperdir"

// SnapshotterConfig is used to configure the overlay snapshotter instance.
// Its fields are unexported and are populated through Opt functions passed to
// NewSnapshotter.
type SnapshotterConfig struct {
        // asyncRemove is set by AsynchronousRemove; when false, Remove gathers
        // the directories to delete as part of the removal itself.
        asyncRemove   bool
        // upperdirLabel is set by WithUpperdirLabel; when true, returned
        // snapshot infos carry the upperdirKey label.
        upperdirLabel bool
        // ms is the metadata store set by WithMetaStore; NewSnapshotter creates
        // one under the root when it is nil.
        ms            MetaStore
        // mountOptions collects the options passed to WithMountOptions;
        // NewSnapshotter may append "userxattr" and "index=off" to it.
        mountOptions  []string
        // remapIDs is set by WithRemapIDs.
        remapIDs      bool
        // slowChown is set by WithSlowChown.
        slowChown     bool
}

// Opt is an option to configure the overlay snapshotter. NewSnapshotter
// applies each Opt to a SnapshotterConfig in order and aborts on the first
// one that returns an error.
type Opt func(config *SnapshotterConfig) error

// AsynchronousRemove defers removal of filesystem content until
// the Cleanup method is called. Removals will make the snapshot
// referred to by the key unavailable and make the key immediately
// available for re-use.
//
// It is an Opt: pass it to NewSnapshotter directly. It always returns nil.
func AsynchronousRemove(config *SnapshotterConfig) error {
        config.asyncRemove = true
        return nil
}

// WithUpperdirLabel enables the optional label
// "containerd.io/snapshot/overlay.upperdir" on returned snapshot infos. The
// label stores the location of the upperdir that contains the changeset
// between the labelled snapshot and its parent.
//
// It is an Opt: pass it to NewSnapshotter directly. It always returns nil.
func WithUpperdirLabel(config *SnapshotterConfig) error {
        config.upperdirLabel = true
        return nil
}

// WithMountOptions defines the default mount options used for the overlay mount.
// The given options are appended to any options configured so far, so the
// option may be supplied more than once.
// NOTE: Options are not applied to bind mounts.
func WithMountOptions(options []string) Opt {
        return func(config *SnapshotterConfig) error {
                config.mountOptions = append(config.mountOptions, options...)
                return nil
        }
}

// MetaStore is the metadata storage the overlay snapshotter runs its
// transactions against. NewSnapshotter uses the value returned by
// storage.NewMetaStore unless another implementation is supplied through
// WithMetaStore.
type MetaStore interface {
        // TransactionContext starts a transaction (writable or not) and returns
        // a context for it together with the transactor controlling it.
        TransactionContext(ctx context.Context, writable bool) (context.Context, storage.Transactor, error)
        // WithTransaction runs fn within a transaction (writable or not) and
        // returns the resulting error.
        WithTransaction(ctx context.Context, writable bool, fn storage.TransactionCallback) error
        // Close releases the store.
        Close() error
}

// WithMetaStore allows the MetaStore to be created outside the snapshotter
// and passed in. When this option is not used (or ms is nil), NewSnapshotter
// creates its own store at <root>/metadata.db.
func WithMetaStore(ms MetaStore) Opt {
        return func(config *SnapshotterConfig) error {
                config.ms = ms
                return nil
        }
}

// WithRemapIDs sets the remapIDs flag of the snapshotter configuration.
// NOTE(review): the flag presumably enables UID/GID remapping of snapshot
// mounts; its consumers are outside this excerpt, so confirm there.
//
// It is an Opt: pass it to NewSnapshotter directly. It always returns nil.
func WithRemapIDs(config *SnapshotterConfig) error {
        config.remapIDs = true
        return nil
}

// WithSlowChown sets the slowChown flag of the snapshotter configuration.
// NOTE(review): the flag presumably permits a recursive chown fallback for
// ID-mapped snapshots; its consumers are outside this excerpt, so confirm
// there.
//
// It is an Opt: pass it to NewSnapshotter directly. It always returns nil.
func WithSlowChown(config *SnapshotterConfig) error {
        config.slowChown = true
        return nil
}

// snapshotter is the overlay snapshotter. Its fields are copied from the
// SnapshotterConfig assembled in NewSnapshotter.
type snapshotter struct {
        // root is the base directory; snapshot directories live under
        // <root>/snapshots.
        root          string
        // ms is the metadata store all transactions run against.
        ms            MetaStore
        // asyncRemove, when false, makes Remove delete unreferenced snapshot
        // directories itself instead of leaving them to Cleanup.
        asyncRemove   bool
        // upperdirLabel, when true, adds the upperdirKey label to infos
        // returned by Stat, Update and Walk.
        upperdirLabel bool
        // options holds the configured mount options.
        options       []string
        // remapIDs and slowChown mirror the corresponding configuration flags.
        remapIDs      bool
        slowChown     bool
}

// NewSnapshotter returns a Snapshotter which uses overlayfs. The overlayfs
// diffs are stored under the provided root. A metadata file is stored under
// the root.
//
// The options are applied in order; the first one returning an error aborts
// construction. The root directory is created if needed and must live on a
// file system with d_type support. Unless a MetaStore was supplied via
// WithMetaStore, one is created at <root>/metadata.db.
//
// Two mount options are added automatically unless the caller configured
// them already: "userxattr" when overlayutils.NeedsUserXAttr reports it is
// needed, and "index=off" when the kernel supports the index feature.
func NewSnapshotter(root string, opts ...Opt) (snapshots.Snapshotter, error) {
        var config SnapshotterConfig
        for _, opt := range opts {
                if err := opt(&config); err != nil {
                        return nil, err
                }
        }

        if err := os.MkdirAll(root, 0700); err != nil {
                return nil, err
        }
        supportsDType, err := fs.SupportsDType(root)
        if err != nil {
                return nil, err
        }
        if !supportsDType {
                return nil, fmt.Errorf("%s does not support d_type. If the backing filesystem is xfs, please reformat with ftype=1 to enable d_type support", root)
        }
        if config.ms == nil {
                config.ms, err = storage.NewMetaStore(filepath.Join(root, "metadata.db"))
                if err != nil {
                        return nil, err
                }
        }

        // An already existing snapshots directory is not an error.
        if err := os.Mkdir(filepath.Join(root, "snapshots"), 0700); err != nil && !os.IsExist(err) {
                return nil, err
        }

        if !hasOption(config.mountOptions, "userxattr", false) {
                // figure out whether "userxattr" option is recognized by the kernel && needed
                userxattr, err := overlayutils.NeedsUserXAttr(root)
                if err != nil {
                        // Detection failure is not fatal; fall back to the value returned
                        // alongside the error.
                        log.L.WithError(err).Warnf("cannot detect whether \"userxattr\" option needs to be used, assuming to be %v", userxattr)
                }
                if userxattr {
                        config.mountOptions = append(config.mountOptions, "userxattr")
                }
        }

        // overlayfs spells this option "index=on|off", so a caller-provided
        // setting has to be detected in its key=value form; otherwise an explicit
        // "index=on" would be followed (and overridden) by our "index=off". A bare
        // "index" token is still honoured for backward compatibility.
        indexConfigured := hasOption(config.mountOptions, "index", false) ||
                hasOption(config.mountOptions, "index", true)
        if !indexConfigured && supportsIndex() {
                config.mountOptions = append(config.mountOptions, "index=off")
        }

        return &snapshotter{
                root:          root,
                ms:            config.ms,
                asyncRemove:   config.asyncRemove,
                upperdirLabel: config.upperdirLabel,
                options:       config.mountOptions,
                remapIDs:      config.remapIDs,
                slowChown:     config.slowChown,
        }, nil
}

// hasOption reports whether options contains the mount option key.
//
// With hasValue false, only an entry exactly equal to key matches. With
// hasValue true, only an entry of the form "key=..." matches (the value
// after '=' may be empty); a bare key does not.
func hasOption(options []string, key string, hasValue bool) bool {
        if !hasValue {
                for _, candidate := range options {
                        if candidate == key {
                                return true
                        }
                }
                return false
        }

        prefix := key + "="
        for _, candidate := range options {
                if strings.HasPrefix(candidate, prefix) {
                        return true
                }
        }
        return false
}

// Stat looks up the metadata of the snapshot identified by key (either an
// active key or a committed name) inside a non-writable transaction.
//
// It is the entry point for parent resolution, existence checks and for
// finding out what kind of snapshot a key refers to. When the upperdir label
// is enabled, the returned info additionally carries the upperdirKey label
// pointing at the snapshot's upper directory.
func (o *snapshotter) Stat(ctx context.Context, key string) (info snapshots.Info, err error) {
        var snapshotID string

        read := func(ctx context.Context) error {
                var getErr error
                snapshotID, info, _, getErr = storage.GetInfo(ctx, key)
                return getErr
        }
        if txErr := o.ms.WithTransaction(ctx, false, read); txErr != nil {
                return info, txErr
        }

        if !o.upperdirLabel {
                return info, nil
        }

        if info.Labels == nil {
                info.Labels = make(map[string]string)
        }
        info.Labels[upperdirKey] = o.upperPath(snapshotID)
        return info, nil
}

// Update applies info to the stored snapshot metadata, restricted to the
// given fieldpaths, inside a writable transaction.
//
// When the upperdir label is enabled, the snapshot ID is resolved in the same
// transaction and the upperdirKey label is added to the returned info. The
// info gathered so far is returned alongside any error.
func (o *snapshotter) Update(ctx context.Context, info snapshots.Info, fieldpaths ...string) (newInfo snapshots.Info, err error) {
        update := func(ctx context.Context) error {
                var updateErr error
                newInfo, updateErr = storage.UpdateInfo(ctx, info, fieldpaths...)
                if updateErr != nil {
                        return updateErr
                }

                if !o.upperdirLabel {
                        return nil
                }

                snapshotID, _, _, getErr := storage.GetInfo(ctx, newInfo.Name)
                if getErr != nil {
                        return getErr
                }
                if newInfo.Labels == nil {
                        newInfo.Labels = make(map[string]string)
                }
                newInfo.Labels[upperdirKey] = o.upperPath(snapshotID)
                return nil
        }

        err = o.ms.WithTransaction(ctx, true, update)
        return newInfo, err
}

// Usage reports the resources consumed by the snapshot identified by key.
//
// Active snapshots are measured on demand by scanning the overlay "diff"
// (aka "upper") directory, which may take some time.
//
// For every other kind of snapshot the value stored in the metadata database
// is returned.
func (o *snapshotter) Usage(ctx context.Context, key string) (_ snapshots.Usage, err error) {
        var (
                stored     snapshots.Usage
                meta       snapshots.Info
                snapshotID string
        )

        read := func(ctx context.Context) error {
                var getErr error
                snapshotID, meta, stored, getErr = storage.GetInfo(ctx, key)
                return getErr
        }
        if txErr := o.ms.WithTransaction(ctx, false, read); txErr != nil {
                return stored, txErr
        }

        if meta.Kind != snapshots.KindActive {
                return stored, nil
        }

        du, duErr := fs.DiskUsage(ctx, o.upperPath(snapshotID))
        if duErr != nil {
                // TODO(stevvooe): Consider not reporting an error in this case.
                return snapshots.Usage{}, duErr
        }
        return snapshots.Usage(du), nil
}

// Prepare creates a snapshot of kind snapshots.KindActive identified by key
// on top of parent (which may be empty) and returns the mounts for it. All
// work is delegated to createSnapshot.
func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
        return o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts)
}

// View creates a snapshot of kind snapshots.KindView identified by key on top
// of parent and returns the mounts for it. All work is delegated to
// createSnapshot.
func (o *snapshotter) View(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
        return o.createSnapshot(ctx, snapshots.KindView, key, parent, opts)
}

// Mounts returns the mounts for the snapshot identified by key. It works for
// both read-write and read-only snapshots.
//
// Use it to recover the mounts previously handed out by View or Prepare.
// Both the snapshot and its info are read in one non-writable transaction.
func (o *snapshotter) Mounts(ctx context.Context, key string) (_ []mount.Mount, err error) {
        var (
                snap storage.Snapshot
                meta snapshots.Info
        )

        load := func(ctx context.Context) error {
                var getErr error
                if snap, getErr = storage.GetSnapshot(ctx, key); getErr != nil {
                        return fmt.Errorf("failed to get active mount: %w", getErr)
                }
                if _, meta, _, getErr = storage.GetInfo(ctx, key); getErr != nil {
                        return fmt.Errorf("failed to get snapshot info: %w", getErr)
                }
                return nil
        }

        if txErr := o.ms.WithTransaction(ctx, false, load); txErr != nil {
                return nil, txErr
        }
        return o.mounts(snap, meta), nil
}

// Commit turns the active snapshot key into the committed snapshot name.
//
// Inside a single writable transaction it resolves the snapshot ID, measures
// the disk usage of the snapshot's upper directory and records that usage
// while committing via storage.CommitActive.
func (o *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error {
        commit := func(ctx context.Context) error {
                // Resolve the ID of the snapshot being committed.
                snapshotID, _, _, err := storage.GetInfo(ctx, key)
                if err != nil {
                        return err
                }

                du, err := fs.DiskUsage(ctx, o.upperPath(snapshotID))
                if err != nil {
                        return err
                }

                _, err = storage.CommitActive(ctx, key, name, snapshots.Usage(du), opts...)
                if err != nil {
                        return fmt.Errorf("failed to commit snapshot %s: %w", key, err)
                }
                return nil
        }

        return o.ms.WithTransaction(ctx, true, commit)
}

// Remove abandons the snapshot identified by key. The snapshot will
// immediately become unavailable and unrecoverable.
//
// With asynchronous removal enabled, disk space is only freed on the next
// call to `Cleanup`. Otherwise the directories no longer referenced by the
// metadata store are collected inside the removal transaction and deleted
// right after it has completed successfully; failures of that deletion are
// logged, not returned.
func (o *snapshotter) Remove(ctx context.Context, key string) (err error) {
        var removals []string
        // Remove directories after the transaction is closed, failures must not
        // return error since the transaction is committed with the removal
        // key no longer available.
        defer func() {
                // err is the named result, i.e. the outcome of the transaction below.
                if err == nil {
                        for _, dir := range removals {
                                if err := os.RemoveAll(dir); err != nil {
                                        log.G(ctx).WithError(err).WithField("path", dir).Warn("failed to remove directory")
                                }
                        }
                }
        }()
        return o.ms.WithTransaction(ctx, true, func(ctx context.Context) error {
                _, _, err = storage.Remove(ctx, key)
                if err != nil {
                        return fmt.Errorf("failed to remove snapshot %s: %w", key, err)
                }

                // Synchronous mode: determine what to delete while still holding
                // the write transaction.
                if !o.asyncRemove {
                        removals, err = o.getCleanupDirectories(ctx)
                        if err != nil {
                                return fmt.Errorf("unable to get directories for removal: %w", err)
                        }
                }
                return nil
        })
}

// Walk hands fn and the filter strings fs to storage.WalkInfo inside a
// non-writable transaction.
//
// When the upperdir label is enabled, each info is first decorated with the
// upperdirKey label (resolved from the snapshot's ID) before fn is called.
func (o *snapshotter) Walk(ctx context.Context, fn snapshots.WalkFunc, fs ...string) error {
        visit := fn
        if o.upperdirLabel {
                visit = func(ctx context.Context, info snapshots.Info) error {
                        snapshotID, _, _, err := storage.GetInfo(ctx, info.Name)
                        if err != nil {
                                return err
                        }
                        if info.Labels == nil {
                                info.Labels = make(map[string]string)
                        }
                        info.Labels[upperdirKey] = o.upperPath(snapshotID)
                        return fn(ctx, info)
                }
        }

        return o.ms.WithTransaction(ctx, false, func(ctx context.Context) error {
                return storage.WalkInfo(ctx, visit, fs...)
        })
}

// Cleanup frees the disk resources left behind by removed or abandoned
// snapshots. Failing to determine the directories is an error; failing to
// delete an individual directory is only logged.
func (o *snapshotter) Cleanup(ctx context.Context) error {
        stale, err := o.cleanupDirectories(ctx)
        if err != nil {
                return err
        }

        for i := range stale {
                target := stale[i]
                if rmErr := os.RemoveAll(target); rmErr != nil {
                        log.G(ctx).WithError(rmErr).WithField("path", target).Warn("failed to remove directory")
                }
        }

        return nil
}

// cleanupDirectories returns the snapshot directories that are no longer
// referenced by the metadata store. The scan runs inside a write transaction
// so that no other write transaction can be entered while it is in progress.
func (o *snapshotter) cleanupDirectories(ctx context.Context) (_ []string, err error) {
        var stale []string

        scan := func(ctx context.Context) error {
                var scanErr error
                stale, scanErr = o.getCleanupDirectories(ctx)
                return scanErr
        }

        if txErr := o.ms.WithTransaction(ctx, true, scan); txErr != nil {
                return nil, txErr
        }
        return stale, nil
}

// getCleanupDirectories lists every entry of <root>/snapshots whose name is
// not a key of the ID map returned by storage.IDMap and returns their full
// paths. ctx must carry the transaction storage.IDMap reads from. The result
// is never nil on success.
func (o *snapshotter) getCleanupDirectories(ctx context.Context) ([]string, error) {
        known, err := storage.IDMap(ctx)
        if err != nil {
                return nil, err
        }

        base := filepath.Join(o.root, "snapshots")
        dir, err := os.Open(base)
        if err != nil {
                return nil, err
        }
        defer dir.Close()

        names, err := dir.Readdirnames(0)
        if err != nil {
                return nil, err
        }

        orphans := []string{}
        for _, name := range names {
                if _, tracked := known[name]; !tracked {
                        orphans = append(orphans, filepath.Join(base, name))
                }
        }

        return orphans, nil
}

// validateIDMapping checks an ID mapping given as
// "<container-id>:<host-id>:<length>".
//
// It returns the scan error when the string does not start with three
// colon-separated integers, an error when any of them is negative, and an
// error when the container ID is not 0. A nil result means the mapping is
// acceptable.
func validateIDMapping(mapping string) error {
        var containerStart, hostStart, size int

        _, scanErr := fmt.Sscanf(mapping, "%d:%d:%d", &containerStart, &hostStart, &size)
        switch {
        case scanErr != nil:
                return scanErr
        case containerStart < 0 || hostStart < 0 || size < 0:
                // Almost impossible, but snapshots.WithLabels doesn't check it
                return fmt.Errorf("invalid mapping \"%d:%d:%d\"", containerStart, hostStart, size)
        case containerStart != 0:
                return fmt.Errorf("container mapping of 0 is only supported")
        }
        return nil
}

// hostID extracts the host ID (the second field) from an ID mapping of the
// form "<container-id>:<host-id>:<length>".
//
// The mapping is first checked with validateIDMapping. On any failure -1 is
// returned together with the error.
func hostID(mapping string) (int, error) {
        if err := validateIDMapping(mapping); err != nil {
                return -1, fmt.Errorf("invalid mapping: %w", err)
        }

        var container, host, size int
        _, err := fmt.Sscanf(mapping, "%d:%d:%d", &container, &host, &size)
        if err != nil {
                return -1, err
        }
        return host, nil
}

// createSnapshot creates a snapshot of the given kind for key on top of parent
// and returns the mounts computed for it by o.mounts.
//
// Inside a single o.ms.WithTransaction call it prepares a temporary directory
// under <root>/snapshots, records the snapshot via storage.CreateSnapshot,
// adjusts ownership of the new "fs" directory, and finally renames the
// temporary directory to <root>/snapshots/<snapshot ID>.
//
// On error, the deferred function removes the temporary directory and, if it
// was already renamed into place, the final snapshot directory as well.
func (o *snapshotter) createSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) (_ []mount.Mount, err error) {
        var (
                s        storage.Snapshot
                td, path string
                info     snapshots.Info
        )

        // Cleanup on failure. td and path are set by the transaction body below;
        // an empty value means there is nothing to remove for that location.
        defer func() {
                if err != nil {
                        if td != "" {
                                if err1 := os.RemoveAll(td); err1 != nil {
                                        log.G(ctx).WithError(err1).Warn("failed to cleanup temp snapshot directory")
                                }
                        }
                        if path != "" {
                                if err1 := os.RemoveAll(path); err1 != nil {
                                        log.G(ctx).WithError(err1).WithField("path", path).Error("failed to reclaim snapshot directory, directory may need removal")
                                        // Surface the cleanup failure while still wrapping the original error.
                                        err = fmt.Errorf("failed to remove path: %v: %w", err1, err)
                                }
                        }
                }
        }()

        // NOTE(review): the `true` argument presumably requests a writable
        // transaction -- confirm against the o.ms implementation.
        if err := o.ms.WithTransaction(ctx, true, func(ctx context.Context) (err error) {
                snapshotDir := filepath.Join(o.root, "snapshots")
                td, err = o.prepareDirectory(ctx, snapshotDir, kind)
                if err != nil {
                        return fmt.Errorf("failed to create prepare snapshot dir: %w", err)
                }

                s, err = storage.CreateSnapshot(ctx, kind, key, parent, opts...)
                if err != nil {
                        return fmt.Errorf("failed to create snapshot: %w", err)
                }

                _, info, _, err = storage.GetInfo(ctx, key)
                if err != nil {
                        return fmt.Errorf("failed to get snapshot info: %w", err)
                }

                // -1 means "no owner determined yet" for the respective ID.
                mappedUID := -1
                mappedGID := -1
                // NOTE: if idmapped mounts' supported by hosted kernel there may be
                // no parents at all, so overlayfs will not work and snapshotter
                // will use bind mount. To be able to create file objects inside the
                // rootfs -- just chown this only bound directory according to provided
                // {uid,gid}map. In case of one/multiple parents -- chown upperdir.
                if v, ok := info.Labels[snapshots.LabelSnapshotUIDMapping]; ok {
                        if mappedUID, err = hostID(v); err != nil {
                                return fmt.Errorf("failed to parse UID mapping: %w", err)
                        }
                }
                if v, ok := info.Labels[snapshots.LabelSnapshotGIDMapping]; ok {
                        if mappedGID, err = hostID(v); err != nil {
                                return fmt.Errorf("failed to parse GID mapping: %w", err)
                        }
                }

                // If either ID is still unset and there is a parent, take BOTH the
                // UID and the GID from the owner of the first parent's upper directory.
                if mappedUID == -1 || mappedGID == -1 {
                        if len(s.ParentIDs) > 0 {
                                st, err := os.Stat(o.upperPath(s.ParentIDs[0]))
                                if err != nil {
                                        return fmt.Errorf("failed to stat parent: %w", err)
                                }
                                stat, ok := st.Sys().(*syscall.Stat_t)
                                if !ok {
                                        return fmt.Errorf("incompatible types after stat call: *syscall.Stat_t expected")
                                }
                                mappedUID = int(stat.Uid)
                                mappedGID = int(stat.Gid)
                        }
                }

                // Only chown when both IDs are known.
                if mappedUID != -1 && mappedGID != -1 {
                        if err := os.Lchown(filepath.Join(td, "fs"), mappedUID, mappedGID); err != nil {
                                return fmt.Errorf("failed to chown: %w", err)
                        }
                }

                // Move the prepared directory to its final, ID-based location.
                path = filepath.Join(snapshotDir, s.ID)
                if err = os.Rename(td, path); err != nil {
                        return fmt.Errorf("failed to rename: %w", err)
                }
                // The temporary directory no longer exists under its old name, so
                // the deferred cleanup must not try to remove it.
                td = ""

                return nil
        }); err != nil {
                return nil, err
        }
        return o.mounts(s, info), nil
}

// prepareDirectory creates a new "new-*" temporary directory inside
// snapshotDir with an "fs" subdirectory (mode 0755) and, for active snapshots,
// a "work" subdirectory (mode 0711).
//
// If the temporary directory itself cannot be created an empty path is
// returned. If a subdirectory cannot be created, the temporary directory path
// is returned together with the error.
func (o *snapshotter) prepareDirectory(ctx context.Context, snapshotDir string, kind snapshots.Kind) (string, error) {
        tmp, err := os.MkdirTemp(snapshotDir, "new-")
        if err != nil {
                return "", fmt.Errorf("failed to create temp dir: %w", err)
        }

        err = os.Mkdir(filepath.Join(tmp, "fs"), 0755)
        if err == nil && kind == snapshots.KindActive {
                err = os.Mkdir(filepath.Join(tmp, "work"), 0711)
        }

        return tmp, err
}

// mounts builds the mount list for snapshot s.
//
//   - With no parents, a single bind mount of the snapshot's own upper
//     directory is returned ("ro" for views, "rw" otherwise).
//   - A non-active snapshot with exactly one parent is a read-only bind mount
//     of that parent's upper directory.
//   - Everything else is an overlay mount whose lowerdir lists the parents'
//     upper directories; active snapshots additionally get workdir/upperdir.
//
// When o.remapIDs is set, uidmap/gidmap options taken from the snapshot labels
// are prepended to the options of whichever mount is returned. o.options is
// appended to overlay mounts only.
func (o *snapshotter) mounts(s storage.Snapshot, info snapshots.Info) []mount.Mount {
        var opts []string

        if o.remapIDs {
                if uidMapping, ok := info.Labels[snapshots.LabelSnapshotUIDMapping]; ok {
                        opts = append(opts, fmt.Sprintf("uidmap=%s", uidMapping))
                }
                if gidMapping, ok := info.Labels[snapshots.LabelSnapshotGIDMapping]; ok {
                        opts = append(opts, fmt.Sprintf("gidmap=%s", gidMapping))
                }
        }

        switch {
        case len(s.ParentIDs) == 0:
                // if we only have one layer/no parents then just return a bind mount as overlay
                // will not work
                access := "rw"
                if s.Kind == snapshots.KindView {
                        access = "ro"
                }
                return []mount.Mount{{
                        Source:  o.upperPath(s.ID),
                        Type:    "bind",
                        Options: append(opts, access, "rbind"),
                }}
        case s.Kind == snapshots.KindActive:
                opts = append(opts,
                        fmt.Sprintf("workdir=%s", o.workPath(s.ID)),
                        fmt.Sprintf("upperdir=%s", o.upperPath(s.ID)),
                )
        case len(s.ParentIDs) == 1:
                return []mount.Mount{{
                        Source:  o.upperPath(s.ParentIDs[0]),
                        Type:    "bind",
                        Options: append(opts, "ro", "rbind"),
                }}
        }

        lowerDirs := make([]string, 0, len(s.ParentIDs))
        for _, parentID := range s.ParentIDs {
                lowerDirs = append(lowerDirs, o.upperPath(parentID))
        }
        opts = append(opts, fmt.Sprintf("lowerdir=%s", strings.Join(lowerDirs, ":")))
        opts = append(opts, o.options...)

        return []mount.Mount{{
                Type:    "overlay",
                Source:  "overlay",
                Options: opts,
        }}
}

// upperPath returns <root>/snapshots/<id>/fs, the "fs" directory of the
// snapshot with the given ID.
func (o *snapshotter) upperPath(id string) string {
        return filepath.Join(o.root, "snapshots", id, "fs")
}

// workPath returns <root>/snapshots/<id>/work, the "work" directory of the
// snapshot with the given ID.
func (o *snapshotter) workPath(id string) string {
        return filepath.Join(o.root, "snapshots", id, "work")
}

// Close closes the snapshotter by closing its underlying store (o.ms) and
// returns that store's Close error, if any.
func (o *snapshotter) Close() error {
        return o.ms.Close()
}

// supportsIndex checks whether the "index=off" option is supported by the
// kernel, which it takes to be the case when
// /sys/module/overlay/parameters/index can be stat'ed.
func supportsIndex() bool {
        _, err := os.Stat("/sys/module/overlay/parameters/index")
        return err == nil
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package overlayutils

import (
        "fmt"
        "os"
        "path/filepath"
        "syscall"

        "golang.org/x/sys/unix"

        "github.com/containerd/containerd/v2/core/mount"
        kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
        "github.com/containerd/containerd/v2/pkg/userns"
        "github.com/containerd/continuity/fs"
        "github.com/containerd/log"
)

const (
        // tmpfsMagic is the TMPFS_MAGIC filesystem type value; IsPathOnTmpfs
        // compares the statfs type of a path against it.
        // see https://man7.org/linux/man-pages/man2/statfs.2.html
        tmpfsMagic = 0x01021994
)

// SupportsMultipleLowerDir checks if the system supports multiple lowerdirs,
// which is required for the overlay snapshotter. It does so by creating a
// scratch directory inside d and test-mounting an overlay with two lowerdirs
// there; a nil return means the mount succeeded.
//
// On 4.x kernels, multiple lowerdirs are always available (so this check isn't
// needed), and the feature was backported to RHEL and CentOS 3.x kernels
// (3.10.0-693.el7.x86_64 and up). This function detects support on those
// kernels without doing a kernel version compare.
//
// Ported from moby overlay2.
func SupportsMultipleLowerDir(d string) error {
        checkDir, err := os.MkdirTemp(d, "multiple-lowerdir-check")
        if err != nil {
                return err
        }
        defer func() {
                if rmErr := os.RemoveAll(checkDir); rmErr != nil {
                        log.L.WithError(rmErr).Warnf("Failed to remove check directory %v", checkDir)
                }
        }()

        // sub resolves a name relative to the scratch directory.
        sub := func(name string) string { return filepath.Join(checkDir, name) }

        for _, name := range []string{"lower1", "lower2", "upper", "work", "merged"} {
                if err := os.Mkdir(sub(name), 0755); err != nil {
                        return err
                }
        }

        probe := mount.Mount{
                Type:   "overlay",
                Source: "overlay",
                Options: []string{
                        fmt.Sprintf("lowerdir=%s:%s,upperdir=%s,workdir=%s", sub("lower2"), sub("lower1"), sub("upper"), sub("work")),
                },
        }
        target := sub("merged")
        if err := probe.Mount(target); err != nil {
                return fmt.Errorf("failed to mount overlay: %w", err)
        }
        // A failed unmount is only logged; the probe itself has succeeded.
        if err := mount.UnmountAll(target, 0); err != nil {
                log.L.WithError(err).Warnf("Failed to unmount check directory %v", target)
        }
        return nil
}

// Supported returns nil when overlayfs is functional on the system with the
// given root directory: root can be created, its filesystem supports d_type,
// and multiple lowerdirs can be mounted.
//
// Supported is not called during plugin initialization, but is exposed for
// downstream projects which use this snapshotter as a library.
func Supported(root string) error {
        if err := os.MkdirAll(root, 0700); err != nil {
                return err
        }

        switch hasDType, err := fs.SupportsDType(root); {
        case err != nil:
                return err
        case !hasDType:
                return fmt.Errorf("%s does not support d_type. If the backing filesystem is xfs, please reformat with ftype=1 to enable d_type support", root)
        }

        return SupportsMultipleLowerDir(root)
}

// IsPathOnTmpfs reports whether the path d is on a tmpfs.
//
// It calls statfs on d and compares the reported filesystem type with
// TMPFS_MAGIC (0x01021994). If statfs fails, a warning is logged and false is
// returned.
// see https://man7.org/linux/man-pages/man2/statfs.2.html
func IsPathOnTmpfs(d string) bool {
        var fsInfo syscall.Statfs_t
        if err := syscall.Statfs(d, &fsInfo); err != nil {
                log.L.WithError(err).Warnf("Could not retrieve statfs for %v", d)
                return false
        }

        return fsInfo.Type == tmpfsMagic
}

// NeedsUserXAttr reports whether overlayfs should be mounted with the
// "userxattr" mount option for a snapshotter rooted at d.
//
// The "userxattr" option is needed for mounting overlayfs inside a user
// namespace with kernel >= 5.11. It is NOT needed in the initial user
// namespace (aka "the host").
//
// Also, Ubuntu (since circa 2015) and Debian (since 10) with kernel < 5.11 can
// mount the overlayfs in a user namespace without the "userxattr" option.
//
// The corresponding kernel commit: https://github.com/torvalds/linux/commit/2d2f2d7322ff43e0fe92bf8cccdc0b09449bf2e1
// > ovl: user xattr
// >
// > Optionally allow using "user.overlay." namespace instead of "trusted.overlay."
// > ...
// > Disable redirect_dir and metacopy options, because these would allow privilege escalation through direct manipulation of the
// > "user.overlay.redirect" or "user.overlay.metacopy" xattrs.
// > ...
//
// The "userxattr" support is not exposed in "/sys/module/overlay/parameters",
// so on kernels older than 5.11 it is probed with a test mount under
// d/userxattr-check.
func NeedsUserXAttr(d string) (bool, error) {
        // Outside a user namespace we are the real root (i.e., the root in the
        // initial user NS) and never need "userxattr". On tmpfs the option is
        // not permitted at all: https://man7.org/linux/man-pages/man5/tmpfs.5.html
        if !userns.RunningInUserNS() || IsPathOnTmpfs(d) {
                return false, nil
        }

        // Fast path on kernels >= 5.11
        //
        // Keep in mind that distro vendors might be going to backport the patch to older kernels
        // so we can't completely remove the "slow path".
        minKernel := kernel.KernelVersion{Kernel: 5, Major: 11}
        if atLeast, err := kernel.GreaterEqualThan(minKernel); err == nil && atLeast {
                return true, nil
        }

        probeRoot := filepath.Join(d, "userxattr-check")
        removeProbeRoot := func() {
                if err := os.RemoveAll(probeRoot); err != nil {
                        log.L.WithError(err).Warnf("Failed to remove check directory %v", probeRoot)
                }
        }

        // Remove whatever is left at the probe root, recreate it, and make sure
        // it is removed again on return.
        removeProbeRoot()
        if err := os.MkdirAll(probeRoot, 0700); err != nil {
                return false, err
        }
        defer removeProbeRoot()

        probeDir, err := os.MkdirTemp(probeRoot, "")
        if err != nil {
                return false, err
        }
        sub := func(name string) string { return filepath.Join(probeDir, name) }

        for _, name := range []string{"lower1", "lower2", "upper", "work", "merged"} {
                if err := os.Mkdir(sub(name), 0755); err != nil {
                        return false, err
                }
        }

        probe := mount.Mount{
                Type:   "overlay",
                Source: "overlay",
                Options: []string{
                        "ro",
                        fmt.Sprintf("lowerdir=%s:%s,upperdir=%s,workdir=%s", sub("lower2"), sub("lower1"), sub("upper"), sub("work")),
                        "userxattr",
                },
        }

        target := sub("merged")
        if err := probe.Mount(target); err != nil {
                // Probably the host is running Ubuntu/Debian kernel (< 5.11) with the userns patch but without the userxattr patch.
                // Return false without error.
                log.L.WithError(err).Debugf("cannot mount overlay with \"userxattr\", probably the kernel does not support userxattr")
                return false, nil
        }
        if err := mount.UnmountAll(target, 0); err != nil {
                log.L.WithError(err).Warnf("Failed to unmount check directory %v", target)
        }
        return true, nil
}

// SupportsIDMappedMounts reports whether this kernel supports idmapped mounts
// for overlayfs.
//
// It returns (true, nil) only when support is positively detected. Every
// failure while probing is returned as (false, err) -- the underlying error
// may be e.g. ENOSYS as well as EPERM -- so callers should treat anything
// other than (true, nil) as "no support from the kernel side".
func SupportsIDMappedMounts() (bool, error) {
        // Fast path
        fiveDotNineteen := kernel.KernelVersion{Kernel: 5, Major: 19}
        if ok, err := kernel.GreaterEqualThan(fiveDotNineteen); err == nil && ok {
                return true, nil
        }

        // Do slow path, because idmapped mounts may be backported to older kernels.
        uidMap := syscall.SysProcIDMap{
                ContainerID: 0,
                HostID:      666,
                Size:        1,
        }
        gidMap := syscall.SysProcIDMap{
                ContainerID: 0,
                HostID:      666,
                Size:        1,
        }
        td, err := os.MkdirTemp("", "ovl-idmapped-check")
        if err != nil {
                return false, fmt.Errorf("failed to create check directory: %w", err)
        }
        // Registered first, so it runs last: after the deferred unmounts below.
        defer func() {
                if err := os.RemoveAll(td); err != nil {
                        log.L.WithError(err).Warnf("failed to remove check directory %s", td)
                }
        }()

        for _, dir := range []string{"lower", "upper", "work", "merged"} {
                if err := os.Mkdir(filepath.Join(td, dir), 0755); err != nil {
                        return false, fmt.Errorf("failed to create %s directory: %w", dir, err)
                }
        }

        lowerDir := filepath.Join(td, "lower")
        upperDir := filepath.Join(td, "upper")
        mergedDir := filepath.Join(td, "merged")

        if err := os.Lchown(upperDir, uidMap.HostID, gidMap.HostID); err != nil {
                return false, fmt.Errorf("failed to chown upper directory %s: %w", upperDir, err)
        }

        uidmap := fmt.Sprintf("%d:%d:%d", uidMap.ContainerID, uidMap.HostID, uidMap.Size)
        gidmap := fmt.Sprintf("%d:%d:%d", gidMap.ContainerID, gidMap.HostID, gidMap.Size)

        usernsFd, err := mount.GetUsernsFD(uidmap, gidmap)
        if err != nil {
                return false, err
        }
        defer usernsFd.Close()

        if err := mount.IDMapMount(lowerDir, lowerDir, int(usernsFd.Fd())); err != nil {
                return false, fmt.Errorf("failed to remap lowerdir %s: %w", lowerDir, err)
        }
        defer func() {
                if err := unix.Unmount(lowerDir, 0); err != nil {
                        log.L.WithError(err).Warnf("failed to unmount lowerdir %s", lowerDir)
                }
        }()

        opts := fmt.Sprintf("index=off,lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperDir, filepath.Join(td, "work"))
        if err := unix.Mount("", mergedDir, "overlay", uintptr(unix.MS_RDONLY), opts); err != nil {
                return false, fmt.Errorf("failed to mount idmapped overlay to %s: %w", mergedDir, err)
        }
        defer func() {
                if err := unix.Unmount(mergedDir, 0); err != nil {
                        log.L.WithError(err).Warnf("failed to unmount overlay check directory %s", mergedDir)
                }
        }()

        // NOTE: we can't just return true if mount didn't fail since overlay supports
        // idmappings for {lower,upper}dir. That means we need to check merged directory
        // to make sure it completely supports idmapped mounts.
        st, err := os.Stat(mergedDir)
        if err != nil {
                return false, fmt.Errorf("failed to stat %s: %w", mergedDir, err)
        }
        stat, ok := st.Sys().(*syscall.Stat_t)
        if !ok {
                return false, fmt.Errorf("incompatible types after stat call: *syscall.Stat_t expected")
        }
        if int(stat.Uid) != uidMap.HostID || int(stat.Gid) != gidMap.HostID {
                return false, fmt.Errorf("bad mapping: expected {uid: %d, gid: %d}; real {uid: %d, gid: %d}", uidMap.HostID, gidMap.HostID, int(stat.Uid), int(stat.Gid))
        }

        return true, nil
}

//go:build linux

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package overlay

import (
        "errors"

        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/containerd/v2/plugins/snapshots/overlay"
        "github.com/containerd/containerd/v2/plugins/snapshots/overlay/overlayutils"
        "github.com/containerd/platforms"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

// Capability strings appended to the plugin's Meta.Capabilities in init:
// capaRemapIDs when overlayutils.SupportsIDMappedMounts reports support, and
// capaOnlyRemapIDs when Config.SlowChown is false.
const (
        capaRemapIDs     = "remap-ids"
        capaOnlyRemapIDs = "only-remap-ids"
)

// Config represents configuration for the overlay plugin.
//
// UpperdirLabel, when true, enables overlay.WithUpperdirLabel. SyncRemove,
// when false, enables overlay.AsynchronousRemove.
type Config struct {
        // Root directory for the plugin. When empty, the plugin's root
        // directory property is used instead.
        RootPath      string `toml:"root_path"`
        UpperdirLabel bool   `toml:"upperdir_label"`
        SyncRemove    bool   `toml:"sync_remove"`

        // SlowChown allows the plugin to fall back to a recursive chown if fast options (like
        // idmap mounts) are not available. See more info about the overhead this can have in
        // github.com/containerd/containerd/docs/user-namespaces/.
        SlowChown bool `toml:"slow_chown"`

        // MountOptions are options used for the overlay mount (not used on bind mounts)
        MountOptions []string `toml:"mount_options"`
}

// init registers the "overlayfs" snapshot plugin. Its InitFn translates the
// plugin Config into overlay options, advertises the remap-ids capabilities,
// exports the snapshotter root directory and creates the snapshotter.
func init() {
        registry.Register(&plugin.Registration{
                Type:   plugins.SnapshotPlugin,
                ID:     "overlayfs",
                Config: &Config{},
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        ic.Meta.Platforms = append(ic.Meta.Platforms, platforms.DefaultSpec())

                        cfg, valid := ic.Config.(*Config)
                        if !valid {
                                return nil, errors.New("invalid overlay configuration")
                        }

                        // An explicitly configured root path wins over the plugin default.
                        rootDir := cfg.RootPath
                        if rootDir == "" {
                                rootDir = ic.Properties[plugins.PropertyRootDir]
                        }

                        var snapshotterOpts []overlay.Opt
                        if cfg.UpperdirLabel {
                                snapshotterOpts = append(snapshotterOpts, overlay.WithUpperdirLabel)
                        }
                        if !cfg.SyncRemove {
                                snapshotterOpts = append(snapshotterOpts, overlay.AsynchronousRemove)
                        }
                        if len(cfg.MountOptions) > 0 {
                                snapshotterOpts = append(snapshotterOpts, overlay.WithMountOptions(cfg.MountOptions))
                        }

                        // A probe error is treated the same as "not supported".
                        idmapSupported, err := overlayutils.SupportsIDMappedMounts()
                        if err == nil && idmapSupported {
                                snapshotterOpts = append(snapshotterOpts, overlay.WithRemapIDs)
                                ic.Meta.Capabilities = append(ic.Meta.Capabilities, capaRemapIDs)
                        }

                        if !cfg.SlowChown {
                                // If slowChown is false, we use capaOnlyRemapIDs to signal we only
                                // allow idmap mounts.
                                ic.Meta.Capabilities = append(ic.Meta.Capabilities, capaOnlyRemapIDs)
                        } else {
                                snapshotterOpts = append(snapshotterOpts, overlay.WithSlowChown)
                        }

                        ic.Meta.Exports[plugins.SnapshotterRootDir] = rootDir
                        return overlay.NewSnapshotter(rootDir, snapshotterOpts...)
                },
        })
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package streaming

import (
        "context"
        "errors"
        "sync"

        "github.com/containerd/containerd/v2/core/leases"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/core/streaming"
        "github.com/containerd/containerd/v2/pkg/gc"
        "github.com/containerd/containerd/v2/pkg/namespaces"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"
)

// init registers the streaming "manager" plugin. The plugin requires the
// metadata plugin; on initialization it creates an empty streamManager and
// registers it with the metadata DB as the collectible resource for
// metadata.ResourceStream.
func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.StreamingPlugin,
                ID:   "manager",
                Requires: []plugin.Type{
                        plugins.MetadataPlugin,
                },
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        metaPlugin, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }

                        manager := &streamManager{
                                streams: make(map[string]map[string]*managedStream),
                                byLease: make(map[string]map[string]map[string]struct{}),
                        }
                        metaPlugin.(*metadata.DB).RegisterCollectibleResource(metadata.ResourceStream, manager)

                        return manager, nil
                },
        })
}

// streamManager keeps track of registered streams per namespace and indexes
// the leased ones by lease.
type streamManager struct {
        // streams maps namespace -> name -> stream
        streams map[string]map[string]*managedStream

        // byLease maps namespace -> lease -> set of stream names
        byLease map[string]map[string]map[string]struct{}

        // rwlock is held by the methods that read or modify streams and byLease.
        // StartCollection takes it for writing and leaves it held until the
        // returned collection context is cancelled or finished.
        rwlock sync.RWMutex
}

// Register adds stream under name in the namespace taken from ctx. If ctx
// carries a lease, the stream name is also recorded under that lease.
// It returns errdefs.ErrAlreadyExists when the namespace already has a stream
// with that name.
func (sm *streamManager) Register(ctx context.Context, name string, stream streaming.Stream) error {
        namespace, _ := namespaces.Namespace(ctx)
        leaseID, _ := leases.FromContext(ctx)

        sm.rwlock.Lock()
        defer sm.rwlock.Unlock()

        byName, ok := sm.streams[namespace]
        if !ok {
                byName = map[string]*managedStream{}
                sm.streams[namespace] = byName
        }
        if _, exists := byName[name]; exists {
                return errdefs.ErrAlreadyExists
        }
        byName[name] = &managedStream{
                Stream:  stream,
                ns:      namespace,
                name:    name,
                lease:   leaseID,
                manager: sm,
        }

        if leaseID == "" {
                return nil
        }

        // Index the stream by lease: namespace -> lease -> set of names.
        leasesInNS, ok := sm.byLease[namespace]
        if !ok {
                leasesInNS = map[string]map[string]struct{}{}
                sm.byLease[namespace] = leasesInNS
        }
        names, ok := leasesInNS[leaseID]
        if !ok {
                names = map[string]struct{}{}
                leasesInNS[leaseID] = names
        }
        names[name] = struct{}{}

        return nil
}

// Get returns the stream registered under name in the namespace taken from
// ctx, or errdefs.ErrNotFound when there is no such stream.
func (sm *streamManager) Get(ctx context.Context, name string) (streaming.Stream, error) {
        namespace, _ := namespaces.Namespace(ctx)

        sm.rwlock.RLock()
        defer sm.rwlock.RUnlock()

        if byName, ok := sm.streams[namespace]; ok {
                if found, ok := byName[name]; ok {
                        return found, nil
                }
        }
        return nil, errdefs.ErrNotFound
}

// StartCollection takes the manager's write lock and returns a collection
// context bound to this manager. The lock is not released here; the
// collection context releases it in Cancel or Finish.
func (sm *streamManager) StartCollection(ctx context.Context) (metadata.CollectionContext, error) {
        // lock now and collection will unlock on cancel or finish
        sm.rwlock.Lock()

        cc := &collectionContext{manager: sm}
        return cc, nil
}

// ReferenceLabel returns the constant label "stream".
// NOTE(review): presumably used by the metadata GC as the reference label for
// stream resources -- confirm against the metadata package.
func (sm *streamManager) ReferenceLabel() string {
        return "stream"
}

// managedStream wraps a registered streaming.Stream together with the
// bookkeeping needed to remove it from its manager again on Close.
type managedStream struct {
        streaming.Stream

        // ns is the namespace the stream was registered in.
        ns      string
        // name is the name the stream was registered under.
        name    string
        // lease is the lease the stream was registered with; empty if none.
        lease   string
        // manager is the streamManager that owns this stream.
        manager *streamManager
}

// Close removes the stream from its manager's namespace and lease indexes,
// dropping maps that become empty, and then closes the wrapped stream. The
// manager lock is released before the wrapped stream's Close is called.
func (m *managedStream) Close() error {
        mgr := m.manager

        mgr.rwlock.Lock()
        if byName, ok := mgr.streams[m.ns]; ok {
                delete(byName, m.name)
                if len(byName) == 0 {
                        delete(mgr.streams, m.ns)
                }
        }
        if m.lease != "" {
                if leasesInNS, ok := mgr.byLease[m.ns]; ok {
                        if names, ok := leasesInNS[m.lease]; ok {
                                delete(names, m.name)
                                if len(names) == 0 {
                                        delete(leasesInNS, m.lease)
                                }
                        }
                        if len(leasesInNS) == 0 {
                                delete(mgr.byLease, m.ns)
                        }
                }
        }
        mgr.rwlock.Unlock()

        return m.Stream.Close()
}

// collectionContext is the collection context returned by
// streamManager.StartCollection. It reads the manager's maps directly and
// releases the manager's lock in Cancel or Finish.
type collectionContext struct {
        // manager is the stream manager being collected.
        manager *streamManager
        // removed accumulates the nodes passed to Remove; Finish processes them.
        removed []gc.Node
}

// All calls fn once for every registered stream, in every namespace, with a
// metadata.ResourceStream node identifying it.
func (cc *collectionContext) All(fn func(gc.Node)) {
        for namespace, byName := range cc.manager.streams {
                for key := range byName {
                        node := gc.Node{
                                Type:      metadata.ResourceStream,
                                Namespace: namespace,
                                Key:       key,
                        }
                        fn(node)
                }
        }
}

// Active calls fn for every stream in namespace ns that was registered
// without a lease.
func (cc *collectionContext) Active(ns string, fn func(gc.Node)) {
        // Ranging over the nil map of an unknown namespace is a no-op.
        for key, ms := range cc.manager.streams[ns] {
                // Don't consider leased streams as active, the lease
                // will determine the status
                // TODO: expire non-active streams
                if ms.lease != "" {
                        continue
                }
                fn(gc.Node{
                        Type:      metadata.ResourceStream,
                        Namespace: ns,
                        Key:       key,
                })
        }
}

// Leased calls fn for every stream in namespace ns that is recorded under the
// given lease.
func (cc *collectionContext) Leased(ns, lease string, fn func(gc.Node)) {
        // Indexing a missing namespace or lease yields a nil map, and ranging
        // over a nil map does nothing.
        for key := range cc.manager.byLease[ns][lease] {
                fn(gc.Node{
                        Type:      metadata.ResourceStream,
                        Namespace: ns,
                        Key:       key,
                })
        }
}

// Remove records n for removal. Nothing is deleted here; the recorded nodes
// are processed by Finish.
func (cc *collectionContext) Remove(n gc.Node) {
        cc.removed = append(cc.removed, n)
}

// Cancel releases the manager lock taken by StartCollection without removing
// anything. It always returns nil.
func (cc *collectionContext) Cancel() error {
        cc.manager.rwlock.Unlock()
        return nil
}

// Finish applies the removals recorded through Remove and then releases
// the manager's rwlock.
//
// For every recorded node the stream is dropped from the manager's streams
// map and, if it carried a lease, from the byLease index as well; maps
// that become empty are deleted from their parent. The underlying streams
// are closed only after the lock has been released, and all close errors
// are combined with errors.Join (nil when every close succeeds).
func (cc *collectionContext) Finish() error {
        mgr := cc.manager

        var pending []streaming.Stream
        for _, n := range cc.removed {
                streams, hasNS := mgr.streams[n.Namespace]
                if !hasNS {
                        // Unknown namespace: no stream and therefore no lease to clean up.
                        continue
                }

                entry, hasStream := streams[n.Key]
                if hasStream {
                        delete(streams, n.Key)
                        pending = append(pending, entry.Stream)
                }
                if len(streams) == 0 {
                        delete(mgr.streams, n.Namespace)
                }

                // Only streams that existed and carried a lease are indexed by lease.
                if !hasStream || entry.lease == "" {
                        continue
                }

                leases, hasLeases := mgr.byLease[n.Namespace]
                if !hasLeases {
                        continue
                }
                if names, hasNames := leases[entry.lease]; hasNames {
                        delete(names, n.Key)
                        if len(names) == 0 {
                                delete(leases, entry.lease)
                        }
                }
                if len(leases) == 0 {
                        delete(mgr.byLease, n.Namespace)
                }
        }
        mgr.rwlock.Unlock()

        // Close outside the lock so a slow Close cannot block other users of
        // the manager.
        var closeErrs []error
        for _, stream := range pending {
                err := stream.Close()
                if err == nil {
                        continue
                }
                closeErrs = append(closeErrs, err)
        }

        return errors.Join(closeErrs...)
}

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package transfer

import (
        "fmt"

        "github.com/containerd/containerd/v2/core/diff"
        "github.com/containerd/containerd/v2/core/leases"
        "github.com/containerd/containerd/v2/core/metadata"
        "github.com/containerd/containerd/v2/core/transfer/local"
        "github.com/containerd/containerd/v2/core/unpack"
        "github.com/containerd/containerd/v2/pkg/imageverifier"
        "github.com/containerd/containerd/v2/plugins"
        "github.com/containerd/errdefs"
        "github.com/containerd/log"
        "github.com/containerd/platforms"
        "github.com/containerd/plugin"
        "github.com/containerd/plugin/registry"

        // Load packages with type registrations
        _ "github.com/containerd/containerd/v2/core/transfer/archive"
        _ "github.com/containerd/containerd/v2/core/transfer/image"
        _ "github.com/containerd/containerd/v2/core/transfer/registry"
)

// init registers the "local" transfer service plugin.
//
// The plugin requires the lease, metadata, diff and image verifier plugins.
// Its InitFn assembles a local.TransferConfig from the plugin configuration:
//   - the lease manager and any image verifier plugins are looked up,
//   - the concurrency limits and registry config path are copied over,
//   - every unpack configuration entry is resolved to a platform matcher,
//     a snapshotter and a diff applier.
//
// When an unpack entry names no differ, the first diff plugin whose
// platforms match the target is used; additional matches are logged and
// skipped. InitFn fails if a platform string does not parse, a snapshotter
// is unknown, or no applier can be found.
func init() {
        registry.Register(&plugin.Registration{
                Type: plugins.TransferPlugin,
                ID:   "local",
                Requires: []plugin.Type{
                        plugins.LeasePlugin,
                        plugins.MetadataPlugin,
                        plugins.DiffPlugin,
                        plugins.ImageVerifierPlugin,
                },
                Config: defaultConfig(),
                InitFn: func(ic *plugin.InitContext) (interface{}, error) {
                        config := ic.Config.(*transferConfig)
                        m, err := ic.GetSingle(plugins.MetadataPlugin)
                        if err != nil {
                                return nil, err
                        }
                        ms := m.(*metadata.DB)

                        var lc local.TransferConfig

                        l, err := ic.GetSingle(plugins.LeasePlugin)
                        if err != nil {
                                return nil, err
                        }
                        lc.Leases = l.(leases.Manager)

                        vps, err := ic.GetByType(plugins.ImageVerifierPlugin)
                        if err != nil {
                                return nil, err
                        }
                        if len(vps) > 0 {
                                lc.Verifiers = make(map[string]imageverifier.ImageVerifier)
                                for name, vp := range vps {
                                        lc.Verifiers[name] = vp.(imageverifier.ImageVerifier)
                                }
                        }

                        // Set configuration based on default or user input
                        lc.MaxConcurrentDownloads = config.MaxConcurrentDownloads
                        lc.MaxConcurrentUploadedLayers = config.MaxConcurrentUploadedLayers

                        // If UnpackConfiguration is not defined, set the default.
                        // If UnpackConfiguration is defined and empty, ignore.
                        if config.UnpackConfiguration == nil {
                                config.UnpackConfiguration = defaultUnpackConfig()
                        }
                        for _, uc := range config.UnpackConfiguration {
                                p, err := platforms.Parse(uc.Platform)
                                if err != nil {
                                        // Wrap the parse error so the reason the platform
                                        // string was rejected is not lost.
                                        return nil, fmt.Errorf("%s: platform configuration %v invalid: %w", plugins.TransferPlugin, uc.Platform, err)
                                }

                                sn := ms.Snapshotter(uc.Snapshotter)
                                if sn == nil {
                                        return nil, fmt.Errorf("snapshotter %q not found: %w", uc.Snapshotter, errdefs.ErrNotFound)
                                }

                                var applier diff.Applier
                                target := platforms.Only(p)
                                if uc.Differ != "" {
                                        // An explicitly configured differ always wins.
                                        inst, err := ic.GetByID(plugins.DiffPlugin, uc.Differ)
                                        if err != nil {
                                                return nil, fmt.Errorf("failed to get instance for diff plugin %q: %w", uc.Differ, err)
                                        }
                                        applier = inst.(diff.Applier)
                                } else {
                                        // Otherwise pick the first diff plugin that supports
                                        // the target platform. The loop variable is not named
                                        // "plugin" so that it does not shadow the package.
                                        for _, dp := range ic.GetAll() {
                                                if dp.Registration.Type != plugins.DiffPlugin {
                                                        continue
                                                }
                                                var matched bool
                                                for _, supported := range dp.Meta.Platforms {
                                                        if target.Match(supported) {
                                                                matched = true
                                                                break
                                                        }
                                                }
                                                if !matched {
                                                        continue
                                                }
                                                if applier != nil {
                                                        log.G(ic.Context).Warnf("multiple differs match for platform, set `differ` option to choose, skipping %q", dp.Registration.ID)
                                                        continue
                                                }
                                                inst, err := dp.Instance()
                                                if err != nil {
                                                        // Identify the plugin by its registration ID,
                                                        // the same identifier the warning above uses.
                                                        return nil, fmt.Errorf("failed to get instance for diff plugin %q: %w", dp.Registration.ID, err)
                                                }
                                                applier = inst.(diff.Applier)
                                        }
                                }
                                if applier == nil {
                                        return nil, fmt.Errorf("no matching diff plugins: %w", errdefs.ErrNotFound)
                                }

                                up := unpack.Platform{
                                        Platform:       target,
                                        SnapshotterKey: uc.Snapshotter,
                                        Snapshotter:    sn,
                                        Applier:        applier,
                                }
                                lc.UnpackPlatforms = append(lc.UnpackPlatforms, up)
                        }
                        lc.RegistryConfigPath = config.RegistryConfigPath

                        return local.NewTransferService(ms.ContentStore(), metadata.NewImageStore(ms), lc), nil
                },
        })
}

// transferConfig is the configuration of the local transfer plugin. The
// field tags give the TOML keys it is read from.
type transferConfig struct {
        // MaxConcurrentDownloads is the max concurrent content downloads for pull.
        MaxConcurrentDownloads int `toml:"max_concurrent_downloads"`

        // MaxConcurrentUploadedLayers is the max concurrent uploads for push
        MaxConcurrentUploadedLayers int `toml:"max_concurrent_uploaded_layers"`

        // UnpackConfiguration is used to read config from toml.
        // A nil value is replaced with defaultUnpackConfig() when the plugin
        // is initialized; a non-nil empty slice is used as is.
        UnpackConfiguration []unpackConfiguration `toml:"unpack_config,omitempty"`

        // RegistryConfigPath is a path to the root directory containing registry-specific configurations
        RegistryConfigPath string `toml:"config_path"`
}

// unpackConfiguration describes one unpack target: which platform to
// match, which snapshotter receives the unpacked layers and, optionally,
// which diff plugin applies them.
type unpackConfiguration struct {
        // Platform is the target unpack platform to match.
        // It is parsed with platforms.Parse during plugin initialization.
        Platform string `toml:"platform"`

        // Snapshotter is the snapshotter to use to unpack
        Snapshotter string `toml:"snapshotter"`

        // Differ is the diff plugin to be used for apply.
        // When empty, plugin initialization picks a diff plugin whose
        // platforms match Platform.
        Differ string `toml:"differ"`
}

// defaultConfig returns the transfer plugin configuration used when the
// user supplies none: three concurrent downloads and three concurrent
// layer uploads, with every other field left at its zero value.
func defaultConfig() *transferConfig {
        cfg := new(transferConfig)
        cfg.MaxConcurrentDownloads = 3
        cfg.MaxConcurrentUploadedLayers = 3
        return cfg
}

//go:build !windows

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package transfer

import (
        "github.com/containerd/containerd/v2/defaults"
        "github.com/containerd/platforms"
)

// defaultUnpackConfig returns the unpack configuration used when none is
// configured: a single entry for the formatted default platform spec,
// unpacked with defaults.DefaultSnapshotter and no explicit differ.
func defaultUnpackConfig() []unpackConfiguration {
        fallback := unpackConfiguration{
                Platform:    platforms.Format(platforms.DefaultSpec()),
                Snapshotter: defaults.DefaultSnapshotter,
        }
        return []unpackConfiguration{fallback}
}